[llvm-branch-commits] [llvm] [amdgpu-cfi: 6/9]: [AMDGPU] Use register pair for PC spill (PR #183146)

Scott Linder via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed May 20 16:46:54 PDT 2026


https://github.com/slinder1 updated https://github.com/llvm/llvm-project/pull/183146

>From fbfe7b44c4044284baea20aaa59521268c70f553 Mon Sep 17 00:00:00 2001
From: Scott Linder <Scott.Linder at amd.com>
Date: Wed, 29 Oct 2025 18:46:12 +0000
Subject: [PATCH] [AMDGPU] Use register pair for PC spill

Change-Id: Ibedeef926f7ff235a06de65a83087c151f66a416
---
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  |   21 +
 .../CodeGen/AMDGPU/GlobalISel/assert-align.ll |    8 +-
 .../GlobalISel/call-outgoing-stack-args.ll    |    8 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |    2 +-
 .../abi-attribute-hints-undefined-behavior.ll |    4 +-
 .../CodeGen/AMDGPU/amdgcn-call-whole-wave.ll  |    8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 8662 ++++++++---------
 .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll   |  226 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll   |   51 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll    |   26 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll   |  123 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll   |   51 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll   |  226 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll   |   51 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll   |  266 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll    |  170 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll   |   51 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll   |  123 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll   |  122 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll    |   26 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   | 3122 +++---
 .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll   |  186 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll   |  858 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll    |  226 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll   | 1650 ++--
 .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll   | 2058 ++--
 .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll   | 2474 ++---
 .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll   | 2922 +++---
 .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll   | 3482 +++----
 .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll    |  122 +-
 .../test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll |    4 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll    |  322 +-
 ...tor-flatscratchinit-undefined-behavior2.ll |   15 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |  604 +-
 .../test/CodeGen/AMDGPU/branch-relax-spill.ll |  156 +-
 .../CodeGen/AMDGPU/call-args-inreg-bfloat.ll  |    8 +-
 .../call-args-inreg-no-sgpr-for-csrspill.ll   |    8 +-
 llvm/test/CodeGen/AMDGPU/call-args-inreg.ll   |   88 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     |   40 +-
 .../AMDGPU/call-graph-register-usage.ll       |    2 +-
 .../AMDGPU/call-preserved-registers.ll        |  116 +-
 llvm/test/CodeGen/AMDGPU/call-skip.ll         |    2 +-
 .../test/CodeGen/AMDGPU/callee-frame-setup.ll |  106 +-
 .../callee-special-input-vgprs-packed.ll      |   14 +-
 .../AMDGPU/callee-special-input-vgprs.ll      |   14 +-
 llvm/test/CodeGen/AMDGPU/cc-entry.ll          |    2 +-
 .../AMDGPU/cc-inreg-sgpr0-3-mismatch.ll       |   10 +-
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   75 +-
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |   10 +-
 llvm/test/CodeGen/AMDGPU/debug-frame.ll       |    8 +-
 .../AMDGPU/dwarf-multi-register-use-crash.ll  |   67 +-
 .../dynamic-vgpr-reserve-stack-for-cwsr.ll    |    4 +-
 .../AMDGPU/eliminate-frame-index-select.ll    |   60 +-
 .../fix-frame-reg-in-custom-csr-spills.ll     |    2 +-
 ...frame-setup-without-sgpr-to-vgpr-spills.ll |   25 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll     |    8 +-
 .../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll   |  148 +-
 .../AMDGPU/gfx-callable-argument-types.ll     | 1226 +--
 .../gfx-callable-preserved-registers.ll       |   72 +-
 .../AMDGPU/gfx-callable-return-types.ll       |   42 +-
 llvm/test/CodeGen/AMDGPU/global-alias.ll      |    8 +-
 .../identical-subrange-spill-infloop.ll       |   96 +-
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     | 1126 +--
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |    4 +-
 .../CodeGen/AMDGPU/insert-waitcnts-crash.ll   |   12 +-
 llvm/test/CodeGen/AMDGPU/issue176578.ll       |   60 +-
 ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll |  175 +-
 .../AMDGPU/materialize-frame-index-sgpr.ll    | 1634 ++--
 .../CodeGen/AMDGPU/mul24-pass-ordering.ll     |   20 +-
 .../AMDGPU/need-fp-from-vgpr-spills.ll        |   10 +-
 llvm/test/CodeGen/AMDGPU/nested-calls.ll      |    4 +-
 .../AMDGPU/no-source-locations-in-prologue.ll |    4 +-
 llvm/test/CodeGen/AMDGPU/nofpclass-call.ll    |   12 +-
 .../AMDGPU/preserve-wwm-copy-dst-reg.ll       |   25 +-
 llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll |   10 +-
 .../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir |  190 +-
 .../AMDGPU/sgpr-spills-split-regalloc.ll      |   27 +-
 .../AMDGPU/shufflevector.v2i64.v8i64.ll       | 1458 +--
 .../si-lower-sgpr-spills-vgpr-lanes-usage.mir |   18 +-
 llvm/test/CodeGen/AMDGPU/sibling-call.ll      |  222 +-
 llvm/test/CodeGen/AMDGPU/stack-realign.ll     |    2 +-
 .../CodeGen/AMDGPU/stacksave_stackrestore.ll  |   47 +-
 .../AMDGPU/strictfp_f16_abi_promote.ll        |   14 +-
 .../CodeGen/AMDGPU/swdev504645-global-fold.ll |    3 +-
 .../tail-call-inreg-arguments.waterfall.ll    |   88 +-
 ...unfold-masked-merge-scalar-variablemask.ll |   38 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |  173 +-
 .../AMDGPU/vgpr-mark-last-scratch-load.ll     |   60 +-
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll   |   12 +-
 ...terfall-call-target-av-register-failure.ll |    6 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |    4 +-
 .../CodeGen/AMDGPU/whole-wave-functions.ll    |   25 +-
 .../AMDGPU/whole-wave-register-copy.ll        |    4 +-
 .../AMDGPU/whole-wave-register-spill.ll       |    4 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |   19 +-
 95 files changed, 18153 insertions(+), 18044 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index e37fa91b6afba..ebeb6b9d1d348 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -285,11 +285,20 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
 
     std::vector<CalleeSavedInfo> CSI;
     const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+    MCRegister RetAddrReg = TRI->getReturnAddressReg(MF);
+    MCRegister RetAddrRegSub0 = TRI->getSubReg(RetAddrReg, AMDGPU::sub0);
+    MCRegister RetAddrRegSub1 = TRI->getSubReg(RetAddrReg, AMDGPU::sub1);
+    bool SpillRetAddrReg = false;
 
     for (unsigned I = 0; CSRegs[I]; ++I) {
       MCRegister Reg = CSRegs[I];
 
       if (SavedRegs.test(Reg)) {
+        if (Reg == RetAddrRegSub0 || Reg == RetAddrRegSub1) {
+          SpillRetAddrReg = true;
+          continue;
+        }
+
         const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
         int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
                                            TRI->getSpillAlign(*RC), true);
@@ -299,6 +308,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
       }
     }
 
+    // Return address uses a register pair. Add the super register to the
+    // CSI list so that it's easier to identify the entire spill and CFI
+    // can be emitted appropriately.
+    if (SpillRetAddrReg) {
+      const TargetRegisterClass *RC =
+          TRI->getMinimalPhysRegClass(RetAddrReg, MVT::i64);
+      int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+                                         TRI->getSpillAlign(*RC), true);
+      CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
+      CalleeSavedFIs.push_back(JunkFI);
+    }
+
     if (!CSI.empty()) {
       for (MachineBasicBlock *SaveBlock : SaveBlocks)
         insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index 62fe5f101b458..e42f9e8cb1001 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -3,7 +3,7 @@
 
 declare hidden ptr addrspace(1) @ext(ptr addrspace(1))
 
-define ptr addrspace(1) @call_assert_align() {
+define ptr addrspace(1) @call_assert_align() #0 {
 ; CHECK-LABEL: call_assert_align:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23,10 +23,10 @@ define ptr addrspace(1) @call_assert_align() {
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -41,7 +41,7 @@ entry:
   ret ptr addrspace(1) %call
 }
 
-define ptr addrspace(1) @tail_call_assert_align() {
+define ptr addrspace(1) @tail_call_assert_align() #0 {
 ; CHECK-LABEL: tail_call_assert_align:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -55,3 +55,5 @@ entry:
   %call = tail call align 4 ptr addrspace(1) @ext(ptr addrspace(1) null)
   ret ptr addrspace(1) %call
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 8f8a52b72241d..faa54d4209f8e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -238,8 +238,8 @@ define void @func_caller_stack() #2 {
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -277,8 +277,8 @@ define void @func_caller_stack() #2 {
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -363,8 +363,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
 ; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -414,8 +414,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 0ecb3e8e94f0c..8225ea1f8fda7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -245,8 +245,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], 0
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e0fdfec35cb75..bb29fb8757f0f 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -30,8 +30,8 @@ define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-NEXT:    s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
 ; FIXEDABI-NEXT:    v_writelane_b32 v40, s31, 1
 ; FIXEDABI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; FIXEDABI-NEXT:    v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT:    v_readlane_b32 s30, v40, 0
+; FIXEDABI-NEXT:    v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT:    s_mov_b32 s32, s33
 ; FIXEDABI-NEXT:    v_readlane_b32 s4, v40, 2
 ; FIXEDABI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -416,5 +416,5 @@ declare i1 @llvm.amdgcn.is.private(ptr)
 declare void @llvm.trap()
 declare void @llvm.debugtrap()
 
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" nounwind }
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index c78544bee46a2..60ce2ce2d99ae 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -35,8 +35,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
 ; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v41, off, s33
 ; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v42, 2
 ; DAGISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -78,8 +78,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
 ; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v41, off, s33
 ; GISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v42, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v42, 1
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s0, v42, 2
 ; GISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -787,8 +787,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v40, 2
 ; DAGISEL-NEXT:    s_or_saveexec_b32 s1, -1
@@ -822,8 +822,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s0, v40, 2
 ; GISEL-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index b16f33a94a551..5a83334419285 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -7885,42 +7885,42 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v20, s30, 0
-; SI-NEXT:    v_writelane_b32 v20, s31, 1
-; SI-NEXT:    v_writelane_b32 v20, s34, 2
-; SI-NEXT:    v_writelane_b32 v20, s35, 3
-; SI-NEXT:    v_writelane_b32 v20, s36, 4
-; SI-NEXT:    v_writelane_b32 v20, s37, 5
-; SI-NEXT:    v_writelane_b32 v20, s38, 6
-; SI-NEXT:    v_writelane_b32 v20, s39, 7
-; SI-NEXT:    v_writelane_b32 v20, s48, 8
-; SI-NEXT:    v_writelane_b32 v20, s49, 9
-; SI-NEXT:    v_writelane_b32 v20, s50, 10
-; SI-NEXT:    v_writelane_b32 v20, s51, 11
-; SI-NEXT:    v_writelane_b32 v20, s52, 12
-; SI-NEXT:    v_writelane_b32 v20, s53, 13
-; SI-NEXT:    v_writelane_b32 v20, s54, 14
-; SI-NEXT:    v_writelane_b32 v20, s55, 15
-; SI-NEXT:    v_writelane_b32 v20, s64, 16
-; SI-NEXT:    v_writelane_b32 v20, s65, 17
-; SI-NEXT:    v_writelane_b32 v20, s66, 18
-; SI-NEXT:    v_writelane_b32 v20, s67, 19
-; SI-NEXT:    v_writelane_b32 v20, s68, 20
-; SI-NEXT:    v_writelane_b32 v20, s69, 21
-; SI-NEXT:    v_writelane_b32 v20, s70, 22
-; SI-NEXT:    v_writelane_b32 v20, s71, 23
-; SI-NEXT:    v_writelane_b32 v20, s80, 24
-; SI-NEXT:    v_writelane_b32 v20, s81, 25
-; SI-NEXT:    v_writelane_b32 v20, s82, 26
-; SI-NEXT:    v_writelane_b32 v20, s83, 27
-; SI-NEXT:    v_writelane_b32 v20, s84, 28
-; SI-NEXT:    v_writelane_b32 v20, s85, 29
-; SI-NEXT:    v_writelane_b32 v20, s86, 30
-; SI-NEXT:    v_writelane_b32 v20, s87, 31
-; SI-NEXT:    v_writelane_b32 v20, s96, 32
-; SI-NEXT:    v_writelane_b32 v20, s97, 33
+; SI-NEXT:    v_writelane_b32 v20, s34, 0
+; SI-NEXT:    v_writelane_b32 v20, s35, 1
+; SI-NEXT:    v_writelane_b32 v20, s36, 2
+; SI-NEXT:    v_writelane_b32 v20, s37, 3
+; SI-NEXT:    v_writelane_b32 v20, s38, 4
+; SI-NEXT:    v_writelane_b32 v20, s39, 5
+; SI-NEXT:    v_writelane_b32 v20, s48, 6
+; SI-NEXT:    v_writelane_b32 v20, s49, 7
+; SI-NEXT:    v_writelane_b32 v20, s50, 8
+; SI-NEXT:    v_writelane_b32 v20, s51, 9
+; SI-NEXT:    v_writelane_b32 v20, s52, 10
+; SI-NEXT:    v_writelane_b32 v20, s53, 11
+; SI-NEXT:    v_writelane_b32 v20, s54, 12
+; SI-NEXT:    v_writelane_b32 v20, s55, 13
+; SI-NEXT:    v_writelane_b32 v20, s64, 14
+; SI-NEXT:    v_writelane_b32 v20, s65, 15
+; SI-NEXT:    v_writelane_b32 v20, s66, 16
+; SI-NEXT:    v_writelane_b32 v20, s67, 17
+; SI-NEXT:    v_writelane_b32 v20, s68, 18
+; SI-NEXT:    v_writelane_b32 v20, s69, 19
+; SI-NEXT:    v_writelane_b32 v20, s70, 20
+; SI-NEXT:    v_writelane_b32 v20, s71, 21
+; SI-NEXT:    v_writelane_b32 v20, s80, 22
+; SI-NEXT:    v_writelane_b32 v20, s81, 23
+; SI-NEXT:    v_writelane_b32 v20, s82, 24
+; SI-NEXT:    v_writelane_b32 v20, s83, 25
+; SI-NEXT:    v_writelane_b32 v20, s84, 26
+; SI-NEXT:    v_writelane_b32 v20, s85, 27
+; SI-NEXT:    v_writelane_b32 v20, s86, 28
+; SI-NEXT:    v_writelane_b32 v20, s87, 29
+; SI-NEXT:    v_writelane_b32 v20, s96, 30
+; SI-NEXT:    v_writelane_b32 v20, s97, 31
+; SI-NEXT:    v_writelane_b32 v20, s98, 32
+; SI-NEXT:    v_writelane_b32 v20, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s44, v19
-; SI-NEXT:    v_writelane_b32 v20, s98, 34
+; SI-NEXT:    v_writelane_b32 v20, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s5, v18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
 ; SI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -7940,7 +7940,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s45, v2
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
-; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    v_writelane_b32 v20, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
 ; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
@@ -8810,6 +8810,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v20, 34
 ; SI-NEXT:    v_readlane_b32 s19, v22, 11
 ; SI-NEXT:    v_readlane_b32 s17, v22, 17
 ; SI-NEXT:    v_readlane_b32 s15, v22, 23
@@ -8817,42 +8818,41 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    v_readlane_b32 s11, v22, 35
 ; SI-NEXT:    v_readlane_b32 s9, v22, 41
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v20, 35
-; SI-NEXT:    v_readlane_b32 s98, v20, 34
-; SI-NEXT:    v_readlane_b32 s97, v20, 33
-; SI-NEXT:    v_readlane_b32 s96, v20, 32
-; SI-NEXT:    v_readlane_b32 s87, v20, 31
-; SI-NEXT:    v_readlane_b32 s86, v20, 30
-; SI-NEXT:    v_readlane_b32 s85, v20, 29
-; SI-NEXT:    v_readlane_b32 s84, v20, 28
-; SI-NEXT:    v_readlane_b32 s83, v20, 27
-; SI-NEXT:    v_readlane_b32 s82, v20, 26
-; SI-NEXT:    v_readlane_b32 s81, v20, 25
-; SI-NEXT:    v_readlane_b32 s80, v20, 24
-; SI-NEXT:    v_readlane_b32 s71, v20, 23
-; SI-NEXT:    v_readlane_b32 s70, v20, 22
-; SI-NEXT:    v_readlane_b32 s69, v20, 21
-; SI-NEXT:    v_readlane_b32 s68, v20, 20
-; SI-NEXT:    v_readlane_b32 s67, v20, 19
-; SI-NEXT:    v_readlane_b32 s66, v20, 18
-; SI-NEXT:    v_readlane_b32 s65, v20, 17
-; SI-NEXT:    v_readlane_b32 s64, v20, 16
-; SI-NEXT:    v_readlane_b32 s55, v20, 15
-; SI-NEXT:    v_readlane_b32 s54, v20, 14
-; SI-NEXT:    v_readlane_b32 s53, v20, 13
-; SI-NEXT:    v_readlane_b32 s52, v20, 12
-; SI-NEXT:    v_readlane_b32 s51, v20, 11
-; SI-NEXT:    v_readlane_b32 s50, v20, 10
-; SI-NEXT:    v_readlane_b32 s49, v20, 9
-; SI-NEXT:    v_readlane_b32 s48, v20, 8
-; SI-NEXT:    v_readlane_b32 s39, v20, 7
-; SI-NEXT:    v_readlane_b32 s38, v20, 6
-; SI-NEXT:    v_readlane_b32 s37, v20, 5
-; SI-NEXT:    v_readlane_b32 s36, v20, 4
-; SI-NEXT:    v_readlane_b32 s35, v20, 3
-; SI-NEXT:    v_readlane_b32 s34, v20, 2
-; SI-NEXT:    v_readlane_b32 s31, v20, 1
-; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    v_readlane_b32 s31, v20, 35
+; SI-NEXT:    v_readlane_b32 s99, v20, 33
+; SI-NEXT:    v_readlane_b32 s98, v20, 32
+; SI-NEXT:    v_readlane_b32 s97, v20, 31
+; SI-NEXT:    v_readlane_b32 s96, v20, 30
+; SI-NEXT:    v_readlane_b32 s87, v20, 29
+; SI-NEXT:    v_readlane_b32 s86, v20, 28
+; SI-NEXT:    v_readlane_b32 s85, v20, 27
+; SI-NEXT:    v_readlane_b32 s84, v20, 26
+; SI-NEXT:    v_readlane_b32 s83, v20, 25
+; SI-NEXT:    v_readlane_b32 s82, v20, 24
+; SI-NEXT:    v_readlane_b32 s81, v20, 23
+; SI-NEXT:    v_readlane_b32 s80, v20, 22
+; SI-NEXT:    v_readlane_b32 s71, v20, 21
+; SI-NEXT:    v_readlane_b32 s70, v20, 20
+; SI-NEXT:    v_readlane_b32 s69, v20, 19
+; SI-NEXT:    v_readlane_b32 s68, v20, 18
+; SI-NEXT:    v_readlane_b32 s67, v20, 17
+; SI-NEXT:    v_readlane_b32 s66, v20, 16
+; SI-NEXT:    v_readlane_b32 s65, v20, 15
+; SI-NEXT:    v_readlane_b32 s64, v20, 14
+; SI-NEXT:    v_readlane_b32 s55, v20, 13
+; SI-NEXT:    v_readlane_b32 s54, v20, 12
+; SI-NEXT:    v_readlane_b32 s53, v20, 11
+; SI-NEXT:    v_readlane_b32 s52, v20, 10
+; SI-NEXT:    v_readlane_b32 s51, v20, 9
+; SI-NEXT:    v_readlane_b32 s50, v20, 8
+; SI-NEXT:    v_readlane_b32 s49, v20, 7
+; SI-NEXT:    v_readlane_b32 s48, v20, 6
+; SI-NEXT:    v_readlane_b32 s39, v20, 5
+; SI-NEXT:    v_readlane_b32 s38, v20, 4
+; SI-NEXT:    v_readlane_b32 s37, v20, 3
+; SI-NEXT:    v_readlane_b32 s36, v20, 2
+; SI-NEXT:    v_readlane_b32 s35, v20, 1
+; SI-NEXT:    v_readlane_b32 s34, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -9049,38 +9049,38 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -9100,7 +9100,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s45, v2
 ; VI-NEXT:    s_cmp_lg_u32 s44, 0
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
 ; VI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -9759,40 +9759,40 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
 ; VI-NEXT:    v_readlane_b32 s7, v33, 1
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -9963,42 +9963,42 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v29, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v29, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v29, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v29, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v29, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v29, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v29, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v29, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v29, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v29, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v29, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v29, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v29, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v29, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v29, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v29, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v29, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v29, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v29, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v29, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v29, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v29, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v29, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v29, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v29, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v29, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v29, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v29, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v29, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v29, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v29, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v29, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v29, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v29, s97, 33
+; GFX9-NEXT:    v_writelane_b32 v29, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v29, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v29, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v29, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v29, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v29, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v29, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v29, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v29, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v29, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v29, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v29, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v29, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v29, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v29, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v29, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v29, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v29, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v29, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v29, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v29, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v29, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v29, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v29, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v29, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v29, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v29, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v29, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v29, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v29, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v29, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v29, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v29, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v29, s99, 33
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v19
-; GFX9-NEXT:    v_writelane_b32 v29, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v29, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v16
@@ -10018,7 +10018,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s44, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
-; GFX9-NEXT:    v_writelane_b32 v29, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v29, s31, 35
 ; GFX9-NEXT:    ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
@@ -10623,43 +10623,43 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX9-NEXT:    v_perm_b32 v1, s4, v3, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_readlane_b32 s30, v29, 34
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    v_readlane_b32 s99, v29, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v29, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v29, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v29, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v29, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v29, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v29, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v29, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v29, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v29, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v29, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v29, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v29, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v29, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v29, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v29, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v29, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v29, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v29, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v29, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v29, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v29, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v29, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v29, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v29, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v29, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v29, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v29, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v29, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v29, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v29, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v29, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v29, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v29, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v29, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v29, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v29, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v29, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v29, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v29, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v29, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v29, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v29, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v29, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v29, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v29, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v29, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v29, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v29, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v29, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v29, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v29, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v29, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v29, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v29, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v29, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v29, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v29, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v29, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v29, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v29, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v29, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v29, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v29, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v29, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v29, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v29, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v29, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v29, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v29, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v29, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -10825,66 +10825,66 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-NEXT:    scratch_store_b32 off, v26, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v27, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v24, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v25, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v24, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v25, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v24, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v25, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v24, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v25, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v24, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v25, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v24, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v25, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v24, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v25, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v24, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v25, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v24, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v25, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v24, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v25, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v24, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v25, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v24, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v25, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    ; implicit-def: $vgpr27 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr26 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v24, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v25, s102, 6
-; GFX11-NEXT:    v_writelane_b32 v24, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v25, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v24, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v25, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v24, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v24, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v24, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v24, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v24, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v24, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v24, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v24, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v24, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v24, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v24, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v24, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v24, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v24, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v24, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v24, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v24, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v24, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v24, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v24, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v24, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v24, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v24, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v24, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v25, s104, 6
+; GFX11-NEXT:    v_writelane_b32 v24, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v25, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v24, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v25, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v24, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v24, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v24, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v24, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v24, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v24, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v24, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v24, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v24, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v24, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v24, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v24, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v24, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v24, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v24, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v24, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v24, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v24, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v24, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v24, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v24, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v24, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v24, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s5, 24
@@ -11213,11 +11213,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-NEXT:    s_lshr_b64 s[42:43], s[4:5], 24
 ; GFX11-NEXT:  .LBB13_3: ; %end
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0xc0c0004
-; GFX11-NEXT:    v_readlane_b32 s31, v24, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v2, s103, s30, v1
-; GFX11-NEXT:    v_readlane_b32 s103, v25, 7
-; GFX11-NEXT:    v_readlane_b32 s30, v24, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v25, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v25, 8
+; GFX11-NEXT:    v_readlane_b32 s103, v25, 5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_perm_b32 v19, s83, s81, v1
 ; GFX11-NEXT:    v_perm_b32 v3, s0, s104, v1
@@ -11298,95 +11298,95 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 1
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 31
 ; GFX11-NEXT:    v_or_b32_e32 v9, v9, v3
-; GFX11-NEXT:    v_readlane_b32 s104, v25, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v25, 6
 ; GFX11-NEXT:    v_or_b32_e32 v3, v10, v11
 ; GFX11-NEXT:    v_perm_b32 v12, s0, s62, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 2
 ; GFX11-NEXT:    scratch_store_b128 v0, v[6:9], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s102, v25, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v25, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v25, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v25, 3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
 ; GFX11-NEXT:    v_perm_b32 v4, s40, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 30
-; GFX11-NEXT:    v_readlane_b32 s100, v25, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v25, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v25, 2
+; GFX11-NEXT:    v_readlane_b32 s100, v25, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v25, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v25, 0
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX11-NEXT:    v_perm_b32 v10, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 0
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 26
-; GFX11-NEXT:    v_readlane_b32 s97, v25, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v25, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v24, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v24, 30
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX11-NEXT:    v_perm_b32 v5, s41, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 28
-; GFX11-NEXT:    v_readlane_b32 s87, v24, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v24, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v24, 29
+; GFX11-NEXT:    v_readlane_b32 s87, v24, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v24, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v24, 27
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    v_perm_b32 v11, s0, s60, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 25
-; GFX11-NEXT:    v_readlane_b32 s84, v24, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v24, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v24, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v24, 25
 ; GFX11-NEXT:    scratch_store_b128 v0, v[2:5], off offset:64
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX11-NEXT:    v_perm_b32 v6, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 29
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 21
-; GFX11-NEXT:    v_readlane_b32 s82, v24, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v24, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v24, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v24, 23
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
 ; GFX11-NEXT:    v_perm_b32 v8, s14, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 27
-; GFX11-NEXT:    v_readlane_b32 s80, v24, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v24, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v24, 22
+; GFX11-NEXT:    v_readlane_b32 s80, v24, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v24, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v24, 20
 ; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
 ; GFX11-NEXT:    v_perm_b32 v10, s15, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 23
-; GFX11-NEXT:    v_readlane_b32 s69, v24, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v24, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v24, 19
+; GFX11-NEXT:    v_readlane_b32 s69, v24, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v24, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v24, 17
 ; GFX11-NEXT:    v_or_b32_e32 v7, v10, v11
 ; GFX11-NEXT:    v_perm_b32 v12, s0, s58, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 24
-; GFX11-NEXT:    v_readlane_b32 s66, v24, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v24, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v24, 16
+; GFX11-NEXT:    v_readlane_b32 s66, v24, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v24, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v24, 14
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
 ; GFX11-NEXT:    v_perm_b32 v8, s12, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 20
-; GFX11-NEXT:    v_readlane_b32 s55, v24, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v24, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v24, 13
+; GFX11-NEXT:    v_readlane_b32 s55, v24, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v24, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v24, 11
 ; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX11-NEXT:    v_perm_b32 v10, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 22
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 16
-; GFX11-NEXT:    v_readlane_b32 s52, v24, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v24, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v24, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v24, 9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
 ; GFX11-NEXT:    v_perm_b32 v9, s13, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 18
-; GFX11-NEXT:    v_readlane_b32 s50, v24, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v24, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v24, 8
+; GFX11-NEXT:    v_readlane_b32 s50, v24, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v24, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v24, 6
 ; GFX11-NEXT:    v_or_b32_e32 v9, v9, v3
 ; GFX11-NEXT:    v_perm_b32 v11, s0, s56, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 15
-; GFX11-NEXT:    v_readlane_b32 s39, v24, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v24, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v24, 5
+; GFX11-NEXT:    v_readlane_b32 s39, v24, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v24, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v24, 3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
 ; GFX11-NEXT:    v_perm_b32 v2, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 19
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 11
-; GFX11-NEXT:    v_readlane_b32 s36, v24, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v24, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v24, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v24, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-NEXT:    v_perm_b32 v4, s10, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 17
-; GFX11-NEXT:    v_readlane_b32 s34, v24, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v24, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
 ; GFX11-NEXT:    v_perm_b32 v10, s11, s0, v1
@@ -22280,43 +22280,42 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v33, s30, 0
-; SI-NEXT:    v_writelane_b32 v33, s31, 1
-; SI-NEXT:    v_writelane_b32 v33, s34, 2
-; SI-NEXT:    v_writelane_b32 v33, s35, 3
-; SI-NEXT:    v_writelane_b32 v33, s36, 4
-; SI-NEXT:    v_writelane_b32 v33, s37, 5
-; SI-NEXT:    v_writelane_b32 v33, s38, 6
-; SI-NEXT:    v_writelane_b32 v33, s39, 7
-; SI-NEXT:    v_writelane_b32 v33, s48, 8
-; SI-NEXT:    v_writelane_b32 v33, s49, 9
-; SI-NEXT:    v_writelane_b32 v33, s50, 10
-; SI-NEXT:    v_writelane_b32 v33, s51, 11
-; SI-NEXT:    v_writelane_b32 v33, s52, 12
-; SI-NEXT:    v_writelane_b32 v33, s53, 13
-; SI-NEXT:    v_writelane_b32 v33, s54, 14
-; SI-NEXT:    v_writelane_b32 v33, s55, 15
-; SI-NEXT:    v_writelane_b32 v33, s64, 16
-; SI-NEXT:    v_writelane_b32 v33, s65, 17
-; SI-NEXT:    v_writelane_b32 v33, s66, 18
-; SI-NEXT:    v_writelane_b32 v33, s67, 19
-; SI-NEXT:    v_writelane_b32 v33, s68, 20
-; SI-NEXT:    v_writelane_b32 v33, s69, 21
-; SI-NEXT:    v_writelane_b32 v33, s70, 22
-; SI-NEXT:    v_writelane_b32 v33, s71, 23
-; SI-NEXT:    v_writelane_b32 v33, s80, 24
-; SI-NEXT:    v_writelane_b32 v33, s81, 25
-; SI-NEXT:    v_writelane_b32 v33, s82, 26
-; SI-NEXT:    v_writelane_b32 v33, s83, 27
-; SI-NEXT:    v_writelane_b32 v33, s84, 28
-; SI-NEXT:    v_writelane_b32 v33, s85, 29
-; SI-NEXT:    v_writelane_b32 v33, s86, 30
-; SI-NEXT:    v_writelane_b32 v33, s87, 31
-; SI-NEXT:    v_writelane_b32 v33, s96, 32
-; SI-NEXT:    v_writelane_b32 v33, s97, 33
-; SI-NEXT:    v_writelane_b32 v33, s98, 34
+; SI-NEXT:    v_writelane_b32 v33, s34, 0
+; SI-NEXT:    v_writelane_b32 v33, s35, 1
+; SI-NEXT:    v_writelane_b32 v33, s36, 2
+; SI-NEXT:    v_writelane_b32 v33, s37, 3
+; SI-NEXT:    v_writelane_b32 v33, s38, 4
+; SI-NEXT:    v_writelane_b32 v33, s39, 5
+; SI-NEXT:    v_writelane_b32 v33, s48, 6
+; SI-NEXT:    v_writelane_b32 v33, s49, 7
+; SI-NEXT:    v_writelane_b32 v33, s50, 8
+; SI-NEXT:    v_writelane_b32 v33, s51, 9
+; SI-NEXT:    v_writelane_b32 v33, s52, 10
+; SI-NEXT:    v_writelane_b32 v33, s53, 11
+; SI-NEXT:    v_writelane_b32 v33, s54, 12
+; SI-NEXT:    v_writelane_b32 v33, s55, 13
+; SI-NEXT:    v_writelane_b32 v33, s64, 14
+; SI-NEXT:    v_writelane_b32 v33, s65, 15
+; SI-NEXT:    v_writelane_b32 v33, s66, 16
+; SI-NEXT:    v_writelane_b32 v33, s67, 17
+; SI-NEXT:    v_writelane_b32 v33, s68, 18
+; SI-NEXT:    v_writelane_b32 v33, s69, 19
+; SI-NEXT:    v_writelane_b32 v33, s70, 20
+; SI-NEXT:    v_writelane_b32 v33, s71, 21
+; SI-NEXT:    v_writelane_b32 v33, s80, 22
+; SI-NEXT:    v_writelane_b32 v33, s81, 23
+; SI-NEXT:    v_writelane_b32 v33, s82, 24
+; SI-NEXT:    v_writelane_b32 v33, s83, 25
+; SI-NEXT:    v_writelane_b32 v33, s84, 26
+; SI-NEXT:    v_writelane_b32 v33, s85, 27
+; SI-NEXT:    v_writelane_b32 v33, s86, 28
+; SI-NEXT:    v_writelane_b32 v33, s87, 29
+; SI-NEXT:    v_writelane_b32 v33, s96, 30
+; SI-NEXT:    v_writelane_b32 v33, s97, 31
+; SI-NEXT:    v_writelane_b32 v33, s98, 32
+; SI-NEXT:    v_writelane_b32 v33, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v33, s99, 35
+; SI-NEXT:    v_writelane_b32 v33, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s70, v17
 ; SI-NEXT:    v_readfirstlane_b32 s71, v16
 ; SI-NEXT:    v_readfirstlane_b32 s80, v15
@@ -22336,6 +22335,7 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s8, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s9, v0
+; SI-NEXT:    v_writelane_b32 v33, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -22745,43 +22745,43 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s5
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT:    v_readlane_b32 s30, v33, 34
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT:    v_readlane_b32 s99, v33, 35
-; SI-NEXT:    v_readlane_b32 s98, v33, 34
-; SI-NEXT:    v_readlane_b32 s97, v33, 33
-; SI-NEXT:    v_readlane_b32 s96, v33, 32
-; SI-NEXT:    v_readlane_b32 s87, v33, 31
-; SI-NEXT:    v_readlane_b32 s86, v33, 30
-; SI-NEXT:    v_readlane_b32 s85, v33, 29
-; SI-NEXT:    v_readlane_b32 s84, v33, 28
-; SI-NEXT:    v_readlane_b32 s83, v33, 27
-; SI-NEXT:    v_readlane_b32 s82, v33, 26
-; SI-NEXT:    v_readlane_b32 s81, v33, 25
-; SI-NEXT:    v_readlane_b32 s80, v33, 24
-; SI-NEXT:    v_readlane_b32 s71, v33, 23
-; SI-NEXT:    v_readlane_b32 s70, v33, 22
-; SI-NEXT:    v_readlane_b32 s69, v33, 21
-; SI-NEXT:    v_readlane_b32 s68, v33, 20
-; SI-NEXT:    v_readlane_b32 s67, v33, 19
-; SI-NEXT:    v_readlane_b32 s66, v33, 18
-; SI-NEXT:    v_readlane_b32 s65, v33, 17
-; SI-NEXT:    v_readlane_b32 s64, v33, 16
-; SI-NEXT:    v_readlane_b32 s55, v33, 15
-; SI-NEXT:    v_readlane_b32 s54, v33, 14
-; SI-NEXT:    v_readlane_b32 s53, v33, 13
-; SI-NEXT:    v_readlane_b32 s52, v33, 12
-; SI-NEXT:    v_readlane_b32 s51, v33, 11
-; SI-NEXT:    v_readlane_b32 s50, v33, 10
-; SI-NEXT:    v_readlane_b32 s49, v33, 9
-; SI-NEXT:    v_readlane_b32 s48, v33, 8
-; SI-NEXT:    v_readlane_b32 s39, v33, 7
-; SI-NEXT:    v_readlane_b32 s38, v33, 6
-; SI-NEXT:    v_readlane_b32 s37, v33, 5
-; SI-NEXT:    v_readlane_b32 s36, v33, 4
-; SI-NEXT:    v_readlane_b32 s35, v33, 3
-; SI-NEXT:    v_readlane_b32 s34, v33, 2
-; SI-NEXT:    v_readlane_b32 s31, v33, 1
-; SI-NEXT:    v_readlane_b32 s30, v33, 0
+; SI-NEXT:    v_readlane_b32 s31, v33, 35
+; SI-NEXT:    v_readlane_b32 s99, v33, 33
+; SI-NEXT:    v_readlane_b32 s98, v33, 32
+; SI-NEXT:    v_readlane_b32 s97, v33, 31
+; SI-NEXT:    v_readlane_b32 s96, v33, 30
+; SI-NEXT:    v_readlane_b32 s87, v33, 29
+; SI-NEXT:    v_readlane_b32 s86, v33, 28
+; SI-NEXT:    v_readlane_b32 s85, v33, 27
+; SI-NEXT:    v_readlane_b32 s84, v33, 26
+; SI-NEXT:    v_readlane_b32 s83, v33, 25
+; SI-NEXT:    v_readlane_b32 s82, v33, 24
+; SI-NEXT:    v_readlane_b32 s81, v33, 23
+; SI-NEXT:    v_readlane_b32 s80, v33, 22
+; SI-NEXT:    v_readlane_b32 s71, v33, 21
+; SI-NEXT:    v_readlane_b32 s70, v33, 20
+; SI-NEXT:    v_readlane_b32 s69, v33, 19
+; SI-NEXT:    v_readlane_b32 s68, v33, 18
+; SI-NEXT:    v_readlane_b32 s67, v33, 17
+; SI-NEXT:    v_readlane_b32 s66, v33, 16
+; SI-NEXT:    v_readlane_b32 s65, v33, 15
+; SI-NEXT:    v_readlane_b32 s64, v33, 14
+; SI-NEXT:    v_readlane_b32 s55, v33, 13
+; SI-NEXT:    v_readlane_b32 s54, v33, 12
+; SI-NEXT:    v_readlane_b32 s53, v33, 11
+; SI-NEXT:    v_readlane_b32 s52, v33, 10
+; SI-NEXT:    v_readlane_b32 s51, v33, 9
+; SI-NEXT:    v_readlane_b32 s50, v33, 8
+; SI-NEXT:    v_readlane_b32 s49, v33, 7
+; SI-NEXT:    v_readlane_b32 s48, v33, 6
+; SI-NEXT:    v_readlane_b32 s39, v33, 5
+; SI-NEXT:    v_readlane_b32 s38, v33, 4
+; SI-NEXT:    v_readlane_b32 s37, v33, 3
+; SI-NEXT:    v_readlane_b32 s36, v33, 2
+; SI-NEXT:    v_readlane_b32 s35, v33, 1
+; SI-NEXT:    v_readlane_b32 s34, v33, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -26076,15 +26076,17 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s12, s25, 0xffff0000
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
@@ -26162,13 +26164,11 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s47
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_and_b32 s43, s42, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -26882,12 +26882,12 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30301,28 +30301,28 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -30342,7 +30342,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s45, v1
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s31, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB21_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 16
@@ -30540,6 +30540,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s46, s38, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 20
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -30572,28 +30573,27 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 21
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -31888,45 +31888,46 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; SI-NEXT:    v_readfirstlane_b32 s10, v15
 ; SI-NEXT:    v_readfirstlane_b32 s12, v14
@@ -31946,7 +31947,6 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s83, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v35, vcc_lo, 0
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_lshr_b32 s69, s29, 16
 ; SI-NEXT:    s_lshr_b32 s71, s28, 16
 ; SI-NEXT:    s_lshr_b32 s82, s27, 16
@@ -32380,42 +32380,42 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB23_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -33476,28 +33476,28 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -33517,7 +33517,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s45, v1
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s31, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 16
@@ -33715,6 +33715,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s46, s38, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 20
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -33747,28 +33748,27 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 21
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -34893,45 +34893,46 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_lshr_b32 s14, s9, 16
 ; SI-NEXT:    v_readfirstlane_b32 s13, v14
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v17
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
 ; SI-NEXT:    s_lshr_b32 s72, s13, 16
@@ -34951,7 +34952,6 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s97, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s14, 0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s95, s28, 16
 ; SI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -35256,6 +35256,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s67, s4, 0x30000
 ; SI-NEXT:  .LBB27_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -35288,42 +35289,41 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -42711,43 +42711,43 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s4, v19
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s45, v18
 ; SI-NEXT:    v_readfirstlane_b32 s44, v17
 ; SI-NEXT:    v_readfirstlane_b32 s47, v16
@@ -43673,32 +43673,31 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s4, s70, 8
 ; SI-NEXT:    s_lshl_b32 s5, s66, 24
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
@@ -43720,9 +43719,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v21
 ; SI-NEXT:    s_lshl_b32 s4, s64, 8
 ; SI-NEXT:    s_lshl_b32 s5, s52, 24
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
@@ -43754,8 +43753,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    s_lshl_b32 s4, s18, 8
 ; SI-NEXT:    s_lshl_b32 s5, s48, 24
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -43786,8 +43785,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    s_lshl_b32 s4, s20, 8
 ; SI-NEXT:    s_lshl_b32 s5, s36, 24
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -43818,8 +43817,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_lshl_b32 s4, s22, 8
 ; SI-NEXT:    s_lshl_b32 s5, s34, 24
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -43888,7 +43887,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s5, s90, 24
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x58, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v59
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_or_b32_e32 v2, s4, v2
@@ -44044,39 +44044,39 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
-; VI-NEXT:    v_writelane_b32 v63, s68, 20
-; VI-NEXT:    v_writelane_b32 v63, s69, 21
-; VI-NEXT:    v_writelane_b32 v63, s70, 22
-; VI-NEXT:    v_writelane_b32 v63, s71, 23
-; VI-NEXT:    v_writelane_b32 v63, s80, 24
-; VI-NEXT:    v_writelane_b32 v63, s81, 25
-; VI-NEXT:    v_writelane_b32 v63, s82, 26
-; VI-NEXT:    v_writelane_b32 v63, s83, 27
-; VI-NEXT:    v_writelane_b32 v63, s84, 28
-; VI-NEXT:    v_writelane_b32 v63, s85, 29
-; VI-NEXT:    v_writelane_b32 v63, s86, 30
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s68, 18
+; VI-NEXT:    v_writelane_b32 v63, s69, 19
+; VI-NEXT:    v_writelane_b32 v63, s70, 20
+; VI-NEXT:    v_writelane_b32 v63, s71, 21
+; VI-NEXT:    v_writelane_b32 v63, s80, 22
+; VI-NEXT:    v_writelane_b32 v63, s81, 23
+; VI-NEXT:    v_writelane_b32 v63, s82, 24
+; VI-NEXT:    v_writelane_b32 v63, s83, 25
+; VI-NEXT:    v_writelane_b32 v63, s84, 26
+; VI-NEXT:    v_writelane_b32 v63, s85, 27
+; VI-NEXT:    v_writelane_b32 v63, s86, 28
+; VI-NEXT:    v_writelane_b32 v63, s87, 29
+; VI-NEXT:    v_writelane_b32 v63, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v63, s87, 31
+; VI-NEXT:    v_writelane_b32 v63, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -44953,38 +44953,38 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v25, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; VI-NEXT:    v_readlane_b32 s87, v63, 31
-; VI-NEXT:    v_readlane_b32 s86, v63, 30
-; VI-NEXT:    v_readlane_b32 s85, v63, 29
-; VI-NEXT:    v_readlane_b32 s84, v63, 28
-; VI-NEXT:    v_readlane_b32 s83, v63, 27
-; VI-NEXT:    v_readlane_b32 s82, v63, 26
-; VI-NEXT:    v_readlane_b32 s81, v63, 25
-; VI-NEXT:    v_readlane_b32 s80, v63, 24
-; VI-NEXT:    v_readlane_b32 s71, v63, 23
-; VI-NEXT:    v_readlane_b32 s70, v63, 22
-; VI-NEXT:    v_readlane_b32 s69, v63, 21
-; VI-NEXT:    v_readlane_b32 s68, v63, 20
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 30
+; VI-NEXT:    v_readlane_b32 s31, v63, 31
+; VI-NEXT:    v_readlane_b32 s87, v63, 29
+; VI-NEXT:    v_readlane_b32 s86, v63, 28
+; VI-NEXT:    v_readlane_b32 s85, v63, 27
+; VI-NEXT:    v_readlane_b32 s84, v63, 26
+; VI-NEXT:    v_readlane_b32 s83, v63, 25
+; VI-NEXT:    v_readlane_b32 s82, v63, 24
+; VI-NEXT:    v_readlane_b32 s81, v63, 23
+; VI-NEXT:    v_readlane_b32 s80, v63, 22
+; VI-NEXT:    v_readlane_b32 s71, v63, 21
+; VI-NEXT:    v_readlane_b32 s70, v63, 20
+; VI-NEXT:    v_readlane_b32 s69, v63, 19
+; VI-NEXT:    v_readlane_b32 s68, v63, 18
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v23, v24, v23, s4
 ; VI-NEXT:    v_perm_b32 v24, v34, v35, s4
@@ -45273,43 +45273,43 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v16
@@ -46152,42 +46152,42 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    v_perm_b32 v25, v25, v46, s4
 ; GFX9-NEXT:    v_perm_b32 v26, v26, v45, s4
 ; GFX9-NEXT:    v_perm_b32 v23, v23, v43, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_perm_b32 v24, v24, v15, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -46481,33 +46481,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -46529,37 +46529,37 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB37_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s5, 24
@@ -47283,47 +47283,47 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -58004,43 +58004,43 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
 ; SI-NEXT:    v_readfirstlane_b32 s7, v16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v15
@@ -58514,42 +58514,42 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v54
 ; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v52
 ; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v50
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -58639,17 +58639,17 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -58742,16 +58742,16 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB41_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -58764,17 +58764,17 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -58867,16 +58867,16 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB41_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -62008,15 +62008,17 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s12, s25, 0xffff0000
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
@@ -62094,13 +62096,11 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s47
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_and_b32 s43, s42, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -62814,12 +62814,12 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -66217,29 +66217,29 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s31, 21
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -66592,28 +66592,28 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v30
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 20
+; SI-NEXT:    v_readlane_b32 s31, v63, 21
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v30, v30, v32
@@ -66631,17 +66631,17 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -66734,16 +66734,16 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB45_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -66756,17 +66756,17 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -66859,16 +66859,16 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB45_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -67968,45 +67968,46 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; SI-NEXT:    v_readfirstlane_b32 s10, v15
 ; SI-NEXT:    v_readfirstlane_b32 s12, v14
@@ -68026,7 +68027,6 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s83, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v35, vcc_lo, 0
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_lshr_b32 s69, s29, 16
 ; SI-NEXT:    s_lshr_b32 s71, s28, 16
 ; SI-NEXT:    s_lshr_b32 s82, s27, 16
@@ -68460,42 +68460,42 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -69540,29 +69540,29 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s31, 21
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -69915,28 +69915,28 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v30
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 20
+; SI-NEXT:    v_readlane_b32 s31, v63, 21
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v30, v30, v32
@@ -69954,17 +69954,17 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -70057,16 +70057,16 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB49_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -70079,17 +70079,17 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -70182,16 +70182,16 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB49_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -71121,45 +71121,46 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_lshr_b32 s14, s9, 16
 ; SI-NEXT:    v_readfirstlane_b32 s13, v14
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v17
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
 ; SI-NEXT:    s_lshr_b32 s72, s13, 16
@@ -71179,7 +71180,6 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s97, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s14, 0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s95, s28, 16
 ; SI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -71484,6 +71484,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s67, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -71516,42 +71517,41 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -77366,42 +77366,42 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v20, s30, 0
-; SI-NEXT:    v_writelane_b32 v20, s31, 1
-; SI-NEXT:    v_writelane_b32 v20, s34, 2
-; SI-NEXT:    v_writelane_b32 v20, s35, 3
-; SI-NEXT:    v_writelane_b32 v20, s36, 4
-; SI-NEXT:    v_writelane_b32 v20, s37, 5
-; SI-NEXT:    v_writelane_b32 v20, s38, 6
-; SI-NEXT:    v_writelane_b32 v20, s39, 7
-; SI-NEXT:    v_writelane_b32 v20, s48, 8
-; SI-NEXT:    v_writelane_b32 v20, s49, 9
-; SI-NEXT:    v_writelane_b32 v20, s50, 10
-; SI-NEXT:    v_writelane_b32 v20, s51, 11
-; SI-NEXT:    v_writelane_b32 v20, s52, 12
-; SI-NEXT:    v_writelane_b32 v20, s53, 13
-; SI-NEXT:    v_writelane_b32 v20, s54, 14
-; SI-NEXT:    v_writelane_b32 v20, s55, 15
-; SI-NEXT:    v_writelane_b32 v20, s64, 16
-; SI-NEXT:    v_writelane_b32 v20, s65, 17
-; SI-NEXT:    v_writelane_b32 v20, s66, 18
-; SI-NEXT:    v_writelane_b32 v20, s67, 19
-; SI-NEXT:    v_writelane_b32 v20, s68, 20
-; SI-NEXT:    v_writelane_b32 v20, s69, 21
-; SI-NEXT:    v_writelane_b32 v20, s70, 22
-; SI-NEXT:    v_writelane_b32 v20, s71, 23
-; SI-NEXT:    v_writelane_b32 v20, s80, 24
-; SI-NEXT:    v_writelane_b32 v20, s81, 25
-; SI-NEXT:    v_writelane_b32 v20, s82, 26
-; SI-NEXT:    v_writelane_b32 v20, s83, 27
-; SI-NEXT:    v_writelane_b32 v20, s84, 28
-; SI-NEXT:    v_writelane_b32 v20, s85, 29
-; SI-NEXT:    v_writelane_b32 v20, s86, 30
-; SI-NEXT:    v_writelane_b32 v20, s87, 31
-; SI-NEXT:    v_writelane_b32 v20, s96, 32
-; SI-NEXT:    v_writelane_b32 v20, s97, 33
+; SI-NEXT:    v_writelane_b32 v20, s34, 0
+; SI-NEXT:    v_writelane_b32 v20, s35, 1
+; SI-NEXT:    v_writelane_b32 v20, s36, 2
+; SI-NEXT:    v_writelane_b32 v20, s37, 3
+; SI-NEXT:    v_writelane_b32 v20, s38, 4
+; SI-NEXT:    v_writelane_b32 v20, s39, 5
+; SI-NEXT:    v_writelane_b32 v20, s48, 6
+; SI-NEXT:    v_writelane_b32 v20, s49, 7
+; SI-NEXT:    v_writelane_b32 v20, s50, 8
+; SI-NEXT:    v_writelane_b32 v20, s51, 9
+; SI-NEXT:    v_writelane_b32 v20, s52, 10
+; SI-NEXT:    v_writelane_b32 v20, s53, 11
+; SI-NEXT:    v_writelane_b32 v20, s54, 12
+; SI-NEXT:    v_writelane_b32 v20, s55, 13
+; SI-NEXT:    v_writelane_b32 v20, s64, 14
+; SI-NEXT:    v_writelane_b32 v20, s65, 15
+; SI-NEXT:    v_writelane_b32 v20, s66, 16
+; SI-NEXT:    v_writelane_b32 v20, s67, 17
+; SI-NEXT:    v_writelane_b32 v20, s68, 18
+; SI-NEXT:    v_writelane_b32 v20, s69, 19
+; SI-NEXT:    v_writelane_b32 v20, s70, 20
+; SI-NEXT:    v_writelane_b32 v20, s71, 21
+; SI-NEXT:    v_writelane_b32 v20, s80, 22
+; SI-NEXT:    v_writelane_b32 v20, s81, 23
+; SI-NEXT:    v_writelane_b32 v20, s82, 24
+; SI-NEXT:    v_writelane_b32 v20, s83, 25
+; SI-NEXT:    v_writelane_b32 v20, s84, 26
+; SI-NEXT:    v_writelane_b32 v20, s85, 27
+; SI-NEXT:    v_writelane_b32 v20, s86, 28
+; SI-NEXT:    v_writelane_b32 v20, s87, 29
+; SI-NEXT:    v_writelane_b32 v20, s96, 30
+; SI-NEXT:    v_writelane_b32 v20, s97, 31
+; SI-NEXT:    v_writelane_b32 v20, s98, 32
+; SI-NEXT:    v_writelane_b32 v20, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s44, v19
-; SI-NEXT:    v_writelane_b32 v20, s98, 34
+; SI-NEXT:    v_writelane_b32 v20, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s5, v18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
 ; SI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -77421,7 +77421,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s45, v2
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
-; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    v_writelane_b32 v20, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
 ; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_4
@@ -78291,6 +78291,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v20, 34
 ; SI-NEXT:    v_readlane_b32 s19, v22, 11
 ; SI-NEXT:    v_readlane_b32 s17, v22, 17
 ; SI-NEXT:    v_readlane_b32 s15, v22, 23
@@ -78298,42 +78299,41 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    v_readlane_b32 s11, v22, 35
 ; SI-NEXT:    v_readlane_b32 s9, v22, 39
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v20, 35
-; SI-NEXT:    v_readlane_b32 s98, v20, 34
-; SI-NEXT:    v_readlane_b32 s97, v20, 33
-; SI-NEXT:    v_readlane_b32 s96, v20, 32
-; SI-NEXT:    v_readlane_b32 s87, v20, 31
-; SI-NEXT:    v_readlane_b32 s86, v20, 30
-; SI-NEXT:    v_readlane_b32 s85, v20, 29
-; SI-NEXT:    v_readlane_b32 s84, v20, 28
-; SI-NEXT:    v_readlane_b32 s83, v20, 27
-; SI-NEXT:    v_readlane_b32 s82, v20, 26
-; SI-NEXT:    v_readlane_b32 s81, v20, 25
-; SI-NEXT:    v_readlane_b32 s80, v20, 24
-; SI-NEXT:    v_readlane_b32 s71, v20, 23
-; SI-NEXT:    v_readlane_b32 s70, v20, 22
-; SI-NEXT:    v_readlane_b32 s69, v20, 21
-; SI-NEXT:    v_readlane_b32 s68, v20, 20
-; SI-NEXT:    v_readlane_b32 s67, v20, 19
-; SI-NEXT:    v_readlane_b32 s66, v20, 18
-; SI-NEXT:    v_readlane_b32 s65, v20, 17
-; SI-NEXT:    v_readlane_b32 s64, v20, 16
-; SI-NEXT:    v_readlane_b32 s55, v20, 15
-; SI-NEXT:    v_readlane_b32 s54, v20, 14
-; SI-NEXT:    v_readlane_b32 s53, v20, 13
-; SI-NEXT:    v_readlane_b32 s52, v20, 12
-; SI-NEXT:    v_readlane_b32 s51, v20, 11
-; SI-NEXT:    v_readlane_b32 s50, v20, 10
-; SI-NEXT:    v_readlane_b32 s49, v20, 9
-; SI-NEXT:    v_readlane_b32 s48, v20, 8
-; SI-NEXT:    v_readlane_b32 s39, v20, 7
-; SI-NEXT:    v_readlane_b32 s38, v20, 6
-; SI-NEXT:    v_readlane_b32 s37, v20, 5
-; SI-NEXT:    v_readlane_b32 s36, v20, 4
-; SI-NEXT:    v_readlane_b32 s35, v20, 3
-; SI-NEXT:    v_readlane_b32 s34, v20, 2
-; SI-NEXT:    v_readlane_b32 s31, v20, 1
-; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    v_readlane_b32 s31, v20, 35
+; SI-NEXT:    v_readlane_b32 s99, v20, 33
+; SI-NEXT:    v_readlane_b32 s98, v20, 32
+; SI-NEXT:    v_readlane_b32 s97, v20, 31
+; SI-NEXT:    v_readlane_b32 s96, v20, 30
+; SI-NEXT:    v_readlane_b32 s87, v20, 29
+; SI-NEXT:    v_readlane_b32 s86, v20, 28
+; SI-NEXT:    v_readlane_b32 s85, v20, 27
+; SI-NEXT:    v_readlane_b32 s84, v20, 26
+; SI-NEXT:    v_readlane_b32 s83, v20, 25
+; SI-NEXT:    v_readlane_b32 s82, v20, 24
+; SI-NEXT:    v_readlane_b32 s81, v20, 23
+; SI-NEXT:    v_readlane_b32 s80, v20, 22
+; SI-NEXT:    v_readlane_b32 s71, v20, 21
+; SI-NEXT:    v_readlane_b32 s70, v20, 20
+; SI-NEXT:    v_readlane_b32 s69, v20, 19
+; SI-NEXT:    v_readlane_b32 s68, v20, 18
+; SI-NEXT:    v_readlane_b32 s67, v20, 17
+; SI-NEXT:    v_readlane_b32 s66, v20, 16
+; SI-NEXT:    v_readlane_b32 s65, v20, 15
+; SI-NEXT:    v_readlane_b32 s64, v20, 14
+; SI-NEXT:    v_readlane_b32 s55, v20, 13
+; SI-NEXT:    v_readlane_b32 s54, v20, 12
+; SI-NEXT:    v_readlane_b32 s53, v20, 11
+; SI-NEXT:    v_readlane_b32 s52, v20, 10
+; SI-NEXT:    v_readlane_b32 s51, v20, 9
+; SI-NEXT:    v_readlane_b32 s50, v20, 8
+; SI-NEXT:    v_readlane_b32 s49, v20, 7
+; SI-NEXT:    v_readlane_b32 s48, v20, 6
+; SI-NEXT:    v_readlane_b32 s39, v20, 5
+; SI-NEXT:    v_readlane_b32 s38, v20, 4
+; SI-NEXT:    v_readlane_b32 s37, v20, 3
+; SI-NEXT:    v_readlane_b32 s36, v20, 2
+; SI-NEXT:    v_readlane_b32 s35, v20, 1
+; SI-NEXT:    v_readlane_b32 s34, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -78530,38 +78530,38 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -78581,7 +78581,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s45, v2
 ; VI-NEXT:    s_cmp_lg_u32 s44, 0
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
 ; VI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -79240,40 +79240,40 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
 ; VI-NEXT:    v_readlane_b32 s7, v33, 1
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -79444,42 +79444,42 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v29, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v29, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v29, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v29, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v29, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v29, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v29, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v29, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v29, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v29, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v29, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v29, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v29, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v29, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v29, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v29, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v29, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v29, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v29, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v29, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v29, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v29, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v29, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v29, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v29, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v29, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v29, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v29, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v29, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v29, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v29, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v29, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v29, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v29, s97, 33
+; GFX9-NEXT:    v_writelane_b32 v29, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v29, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v29, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v29, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v29, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v29, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v29, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v29, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v29, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v29, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v29, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v29, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v29, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v29, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v29, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v29, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v29, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v29, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v29, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v29, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v29, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v29, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v29, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v29, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v29, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v29, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v29, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v29, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v29, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v29, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v29, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v29, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v29, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v29, s99, 33
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v19
-; GFX9-NEXT:    v_writelane_b32 v29, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v29, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v16
@@ -79499,7 +79499,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s44, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
-; GFX9-NEXT:    v_writelane_b32 v29, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v29, s31, 35
 ; GFX9-NEXT:    ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB57_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
@@ -80104,43 +80104,43 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX9-NEXT:    v_perm_b32 v1, s4, v3, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_readlane_b32 s30, v29, 34
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    v_readlane_b32 s99, v29, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v29, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v29, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v29, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v29, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v29, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v29, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v29, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v29, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v29, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v29, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v29, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v29, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v29, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v29, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v29, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v29, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v29, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v29, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v29, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v29, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v29, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v29, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v29, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v29, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v29, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v29, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v29, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v29, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v29, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v29, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v29, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v29, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v29, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v29, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v29, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v29, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v29, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v29, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v29, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v29, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v29, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v29, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v29, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v29, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v29, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v29, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v29, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v29, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v29, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v29, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v29, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v29, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v29, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v29, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v29, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v29, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v29, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v29, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v29, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v29, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v29, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v29, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v29, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v29, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v29, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v29, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v29, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v29, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v29, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v29, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -80306,66 +80306,66 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-NEXT:    scratch_store_b32 off, v26, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v27, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v24, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v25, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v24, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v25, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v24, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v25, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v24, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v25, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v24, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v25, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v24, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v25, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v24, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v25, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v24, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v25, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v24, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v25, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v24, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v25, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v24, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v25, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v24, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v25, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    ; implicit-def: $vgpr27 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr26 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v24, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v25, s102, 6
-; GFX11-NEXT:    v_writelane_b32 v24, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v25, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v24, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v25, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v24, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v24, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v24, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v24, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v24, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v24, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v24, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v24, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v24, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v24, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v24, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v24, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v24, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v24, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v24, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v24, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v24, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v24, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v24, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v24, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v24, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v24, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v24, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v24, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v25, s104, 6
+; GFX11-NEXT:    v_writelane_b32 v24, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v25, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v24, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v25, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v24, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v24, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v24, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v24, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v24, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v24, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v24, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v24, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v24, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v24, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v24, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v24, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v24, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v24, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v24, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v24, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v24, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v24, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v24, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v24, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v24, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v24, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v24, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB57_4
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s5, 24
@@ -80699,11 +80699,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-NEXT:    s_lshr_b64 s[42:43], s[4:5], 24
 ; GFX11-NEXT:  .LBB57_3: ; %end
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0xc0c0004
-; GFX11-NEXT:    v_readlane_b32 s31, v24, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v2, s103, s30, v1
-; GFX11-NEXT:    v_readlane_b32 s103, v25, 7
-; GFX11-NEXT:    v_readlane_b32 s30, v24, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v25, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v25, 8
+; GFX11-NEXT:    v_readlane_b32 s103, v25, 5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_perm_b32 v19, s83, s81, v1
 ; GFX11-NEXT:    v_perm_b32 v3, s0, s104, v1
@@ -80784,95 +80784,95 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 1
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 31
 ; GFX11-NEXT:    v_or_b32_e32 v9, v9, v3
-; GFX11-NEXT:    v_readlane_b32 s104, v25, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v25, 6
 ; GFX11-NEXT:    v_or_b32_e32 v3, v10, v11
 ; GFX11-NEXT:    v_perm_b32 v12, s0, s62, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 2
 ; GFX11-NEXT:    scratch_store_b128 v0, v[6:9], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s102, v25, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v25, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v25, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v25, 3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
 ; GFX11-NEXT:    v_perm_b32 v4, s40, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 30
-; GFX11-NEXT:    v_readlane_b32 s100, v25, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v25, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v25, 2
+; GFX11-NEXT:    v_readlane_b32 s100, v25, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v25, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v25, 0
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX11-NEXT:    v_perm_b32 v10, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v26, 0
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 26
-; GFX11-NEXT:    v_readlane_b32 s97, v25, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v25, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v24, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v24, 30
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX11-NEXT:    v_perm_b32 v5, s41, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 28
-; GFX11-NEXT:    v_readlane_b32 s87, v24, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v24, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v24, 29
+; GFX11-NEXT:    v_readlane_b32 s87, v24, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v24, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v24, 27
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    v_perm_b32 v11, s0, s60, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 25
-; GFX11-NEXT:    v_readlane_b32 s84, v24, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v24, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v24, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v24, 25
 ; GFX11-NEXT:    scratch_store_b128 v0, v[2:5], off offset:64
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX11-NEXT:    v_perm_b32 v6, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 29
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 21
-; GFX11-NEXT:    v_readlane_b32 s82, v24, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v24, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v24, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v24, 23
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
 ; GFX11-NEXT:    v_perm_b32 v8, s14, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 27
-; GFX11-NEXT:    v_readlane_b32 s80, v24, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v24, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v24, 22
+; GFX11-NEXT:    v_readlane_b32 s80, v24, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v24, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v24, 20
 ; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
 ; GFX11-NEXT:    v_perm_b32 v10, s15, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 23
-; GFX11-NEXT:    v_readlane_b32 s69, v24, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v24, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v24, 19
+; GFX11-NEXT:    v_readlane_b32 s69, v24, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v24, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v24, 17
 ; GFX11-NEXT:    v_or_b32_e32 v7, v10, v11
 ; GFX11-NEXT:    v_perm_b32 v12, s0, s58, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 24
-; GFX11-NEXT:    v_readlane_b32 s66, v24, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v24, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v24, 16
+; GFX11-NEXT:    v_readlane_b32 s66, v24, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v24, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v24, 14
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
 ; GFX11-NEXT:    v_perm_b32 v8, s12, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 20
-; GFX11-NEXT:    v_readlane_b32 s55, v24, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v24, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v24, 13
+; GFX11-NEXT:    v_readlane_b32 s55, v24, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v24, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v24, 11
 ; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX11-NEXT:    v_perm_b32 v10, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 22
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 16
-; GFX11-NEXT:    v_readlane_b32 s52, v24, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v24, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v24, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v24, 9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
 ; GFX11-NEXT:    v_perm_b32 v9, s13, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 18
-; GFX11-NEXT:    v_readlane_b32 s50, v24, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v24, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v24, 8
+; GFX11-NEXT:    v_readlane_b32 s50, v24, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v24, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v24, 6
 ; GFX11-NEXT:    v_or_b32_e32 v9, v9, v3
 ; GFX11-NEXT:    v_perm_b32 v11, s0, s56, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 15
-; GFX11-NEXT:    v_readlane_b32 s39, v24, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v24, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v24, 5
+; GFX11-NEXT:    v_readlane_b32 s39, v24, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v24, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v24, 3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
 ; GFX11-NEXT:    v_perm_b32 v2, s1, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 19
 ; GFX11-NEXT:    v_readlane_b32 s1, v27, 11
-; GFX11-NEXT:    v_readlane_b32 s36, v24, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v24, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v24, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v24, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
 ; GFX11-NEXT:    v_perm_b32 v4, s10, s0, v1
 ; GFX11-NEXT:    v_readlane_b32 s0, v27, 17
-; GFX11-NEXT:    v_readlane_b32 s34, v24, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v24, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
 ; GFX11-NEXT:    v_perm_b32 v10, s11, s0, v1
@@ -91774,43 +91774,42 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
 ; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v33, s30, 0
-; SI-NEXT:    v_writelane_b32 v33, s31, 1
-; SI-NEXT:    v_writelane_b32 v33, s34, 2
-; SI-NEXT:    v_writelane_b32 v33, s35, 3
-; SI-NEXT:    v_writelane_b32 v33, s36, 4
-; SI-NEXT:    v_writelane_b32 v33, s37, 5
-; SI-NEXT:    v_writelane_b32 v33, s38, 6
-; SI-NEXT:    v_writelane_b32 v33, s39, 7
-; SI-NEXT:    v_writelane_b32 v33, s48, 8
-; SI-NEXT:    v_writelane_b32 v33, s49, 9
-; SI-NEXT:    v_writelane_b32 v33, s50, 10
-; SI-NEXT:    v_writelane_b32 v33, s51, 11
-; SI-NEXT:    v_writelane_b32 v33, s52, 12
-; SI-NEXT:    v_writelane_b32 v33, s53, 13
-; SI-NEXT:    v_writelane_b32 v33, s54, 14
-; SI-NEXT:    v_writelane_b32 v33, s55, 15
-; SI-NEXT:    v_writelane_b32 v33, s64, 16
-; SI-NEXT:    v_writelane_b32 v33, s65, 17
-; SI-NEXT:    v_writelane_b32 v33, s66, 18
-; SI-NEXT:    v_writelane_b32 v33, s67, 19
-; SI-NEXT:    v_writelane_b32 v33, s68, 20
-; SI-NEXT:    v_writelane_b32 v33, s69, 21
-; SI-NEXT:    v_writelane_b32 v33, s70, 22
-; SI-NEXT:    v_writelane_b32 v33, s71, 23
-; SI-NEXT:    v_writelane_b32 v33, s80, 24
-; SI-NEXT:    v_writelane_b32 v33, s81, 25
-; SI-NEXT:    v_writelane_b32 v33, s82, 26
-; SI-NEXT:    v_writelane_b32 v33, s83, 27
-; SI-NEXT:    v_writelane_b32 v33, s84, 28
-; SI-NEXT:    v_writelane_b32 v33, s85, 29
-; SI-NEXT:    v_writelane_b32 v33, s86, 30
-; SI-NEXT:    v_writelane_b32 v33, s87, 31
-; SI-NEXT:    v_writelane_b32 v33, s96, 32
-; SI-NEXT:    v_writelane_b32 v33, s97, 33
-; SI-NEXT:    v_writelane_b32 v33, s98, 34
+; SI-NEXT:    v_writelane_b32 v33, s34, 0
+; SI-NEXT:    v_writelane_b32 v33, s35, 1
+; SI-NEXT:    v_writelane_b32 v33, s36, 2
+; SI-NEXT:    v_writelane_b32 v33, s37, 3
+; SI-NEXT:    v_writelane_b32 v33, s38, 4
+; SI-NEXT:    v_writelane_b32 v33, s39, 5
+; SI-NEXT:    v_writelane_b32 v33, s48, 6
+; SI-NEXT:    v_writelane_b32 v33, s49, 7
+; SI-NEXT:    v_writelane_b32 v33, s50, 8
+; SI-NEXT:    v_writelane_b32 v33, s51, 9
+; SI-NEXT:    v_writelane_b32 v33, s52, 10
+; SI-NEXT:    v_writelane_b32 v33, s53, 11
+; SI-NEXT:    v_writelane_b32 v33, s54, 12
+; SI-NEXT:    v_writelane_b32 v33, s55, 13
+; SI-NEXT:    v_writelane_b32 v33, s64, 14
+; SI-NEXT:    v_writelane_b32 v33, s65, 15
+; SI-NEXT:    v_writelane_b32 v33, s66, 16
+; SI-NEXT:    v_writelane_b32 v33, s67, 17
+; SI-NEXT:    v_writelane_b32 v33, s68, 18
+; SI-NEXT:    v_writelane_b32 v33, s69, 19
+; SI-NEXT:    v_writelane_b32 v33, s70, 20
+; SI-NEXT:    v_writelane_b32 v33, s71, 21
+; SI-NEXT:    v_writelane_b32 v33, s80, 22
+; SI-NEXT:    v_writelane_b32 v33, s81, 23
+; SI-NEXT:    v_writelane_b32 v33, s82, 24
+; SI-NEXT:    v_writelane_b32 v33, s83, 25
+; SI-NEXT:    v_writelane_b32 v33, s84, 26
+; SI-NEXT:    v_writelane_b32 v33, s85, 27
+; SI-NEXT:    v_writelane_b32 v33, s86, 28
+; SI-NEXT:    v_writelane_b32 v33, s87, 29
+; SI-NEXT:    v_writelane_b32 v33, s96, 30
+; SI-NEXT:    v_writelane_b32 v33, s97, 31
+; SI-NEXT:    v_writelane_b32 v33, s98, 32
+; SI-NEXT:    v_writelane_b32 v33, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v33, s99, 35
+; SI-NEXT:    v_writelane_b32 v33, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s70, v17
 ; SI-NEXT:    v_readfirstlane_b32 s71, v16
 ; SI-NEXT:    v_readfirstlane_b32 s80, v15
@@ -91830,6 +91829,7 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s8, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s9, v0
+; SI-NEXT:    v_writelane_b32 v33, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB61_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -92137,43 +92137,43 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
 ; SI-NEXT:    v_readlane_b32 s4, v34, 1
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT:    v_readlane_b32 s30, v33, 34
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT:    v_readlane_b32 s99, v33, 35
-; SI-NEXT:    v_readlane_b32 s98, v33, 34
-; SI-NEXT:    v_readlane_b32 s97, v33, 33
-; SI-NEXT:    v_readlane_b32 s96, v33, 32
-; SI-NEXT:    v_readlane_b32 s87, v33, 31
-; SI-NEXT:    v_readlane_b32 s86, v33, 30
-; SI-NEXT:    v_readlane_b32 s85, v33, 29
-; SI-NEXT:    v_readlane_b32 s84, v33, 28
-; SI-NEXT:    v_readlane_b32 s83, v33, 27
-; SI-NEXT:    v_readlane_b32 s82, v33, 26
-; SI-NEXT:    v_readlane_b32 s81, v33, 25
-; SI-NEXT:    v_readlane_b32 s80, v33, 24
-; SI-NEXT:    v_readlane_b32 s71, v33, 23
-; SI-NEXT:    v_readlane_b32 s70, v33, 22
-; SI-NEXT:    v_readlane_b32 s69, v33, 21
-; SI-NEXT:    v_readlane_b32 s68, v33, 20
-; SI-NEXT:    v_readlane_b32 s67, v33, 19
-; SI-NEXT:    v_readlane_b32 s66, v33, 18
-; SI-NEXT:    v_readlane_b32 s65, v33, 17
-; SI-NEXT:    v_readlane_b32 s64, v33, 16
-; SI-NEXT:    v_readlane_b32 s55, v33, 15
-; SI-NEXT:    v_readlane_b32 s54, v33, 14
-; SI-NEXT:    v_readlane_b32 s53, v33, 13
-; SI-NEXT:    v_readlane_b32 s52, v33, 12
-; SI-NEXT:    v_readlane_b32 s51, v33, 11
-; SI-NEXT:    v_readlane_b32 s50, v33, 10
-; SI-NEXT:    v_readlane_b32 s49, v33, 9
-; SI-NEXT:    v_readlane_b32 s48, v33, 8
-; SI-NEXT:    v_readlane_b32 s39, v33, 7
-; SI-NEXT:    v_readlane_b32 s38, v33, 6
-; SI-NEXT:    v_readlane_b32 s37, v33, 5
-; SI-NEXT:    v_readlane_b32 s36, v33, 4
-; SI-NEXT:    v_readlane_b32 s35, v33, 3
-; SI-NEXT:    v_readlane_b32 s34, v33, 2
-; SI-NEXT:    v_readlane_b32 s31, v33, 1
-; SI-NEXT:    v_readlane_b32 s30, v33, 0
+; SI-NEXT:    v_readlane_b32 s31, v33, 35
+; SI-NEXT:    v_readlane_b32 s99, v33, 33
+; SI-NEXT:    v_readlane_b32 s98, v33, 32
+; SI-NEXT:    v_readlane_b32 s97, v33, 31
+; SI-NEXT:    v_readlane_b32 s96, v33, 30
+; SI-NEXT:    v_readlane_b32 s87, v33, 29
+; SI-NEXT:    v_readlane_b32 s86, v33, 28
+; SI-NEXT:    v_readlane_b32 s85, v33, 27
+; SI-NEXT:    v_readlane_b32 s84, v33, 26
+; SI-NEXT:    v_readlane_b32 s83, v33, 25
+; SI-NEXT:    v_readlane_b32 s82, v33, 24
+; SI-NEXT:    v_readlane_b32 s81, v33, 23
+; SI-NEXT:    v_readlane_b32 s80, v33, 22
+; SI-NEXT:    v_readlane_b32 s71, v33, 21
+; SI-NEXT:    v_readlane_b32 s70, v33, 20
+; SI-NEXT:    v_readlane_b32 s69, v33, 19
+; SI-NEXT:    v_readlane_b32 s68, v33, 18
+; SI-NEXT:    v_readlane_b32 s67, v33, 17
+; SI-NEXT:    v_readlane_b32 s66, v33, 16
+; SI-NEXT:    v_readlane_b32 s65, v33, 15
+; SI-NEXT:    v_readlane_b32 s64, v33, 14
+; SI-NEXT:    v_readlane_b32 s55, v33, 13
+; SI-NEXT:    v_readlane_b32 s54, v33, 12
+; SI-NEXT:    v_readlane_b32 s53, v33, 11
+; SI-NEXT:    v_readlane_b32 s52, v33, 10
+; SI-NEXT:    v_readlane_b32 s51, v33, 9
+; SI-NEXT:    v_readlane_b32 s50, v33, 8
+; SI-NEXT:    v_readlane_b32 s49, v33, 7
+; SI-NEXT:    v_readlane_b32 s48, v33, 6
+; SI-NEXT:    v_readlane_b32 s39, v33, 5
+; SI-NEXT:    v_readlane_b32 s38, v33, 4
+; SI-NEXT:    v_readlane_b32 s37, v33, 3
+; SI-NEXT:    v_readlane_b32 s36, v33, 2
+; SI-NEXT:    v_readlane_b32 s35, v33, 1
+; SI-NEXT:    v_readlane_b32 s34, v33, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -95538,15 +95538,17 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s12, s25, 0xffff0000
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
@@ -95624,13 +95626,11 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s47
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_and_b32 s43, s42, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -96344,12 +96344,12 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -99771,28 +99771,28 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -99812,7 +99812,7 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s45, v1
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s31, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB65_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 16
@@ -100010,6 +100010,7 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s46, s38, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 20
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -100042,28 +100043,27 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 21
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -101358,45 +101358,46 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; SI-NEXT:    v_readfirstlane_b32 s10, v15
 ; SI-NEXT:    v_readfirstlane_b32 s12, v14
@@ -101416,7 +101417,6 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s83, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v35, vcc_lo, 0
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_lshr_b32 s69, s29, 16
 ; SI-NEXT:    s_lshr_b32 s71, s28, 16
 ; SI-NEXT:    s_lshr_b32 s82, s27, 16
@@ -101850,42 +101850,42 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB67_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -102954,28 +102954,28 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -102995,7 +102995,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s45, v1
 ; SI-NEXT:    s_cmp_lg_u32 s44, 0
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s31, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 16
@@ -103193,6 +103193,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s46, s38, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 20
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -103225,28 +103226,27 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 21
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -104371,45 +104371,46 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_lshr_b32 s14, s9, 16
 ; SI-NEXT:    v_readfirstlane_b32 s13, v14
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v17
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
 ; SI-NEXT:    s_lshr_b32 s72, s13, 16
@@ -104429,7 +104430,6 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s97, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s14, 0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s95, s28, 16
 ; SI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -104734,6 +104734,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s67, s4, 0x30000
 ; SI-NEXT:  .LBB71_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -104766,42 +104767,41 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -109212,43 +109212,43 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s44, v19
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s5, v18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
 ; SI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -110113,38 +110113,38 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    s_lshl_b32 s6, s37, 8
 ; SI-NEXT:    s_lshl_b32 s8, s35, 24
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -110177,9 +110177,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    s_lshl_b32 s6, s34, 8
 ; SI-NEXT:    s_lshl_b32 s8, s30, 24
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v18
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -110213,7 +110212,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    s_lshl_b32 s6, s95, 8
 ; SI-NEXT:    s_lshl_b32 s8, s93, 24
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
 ; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v55
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
@@ -110603,39 +110603,39 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
-; VI-NEXT:    v_writelane_b32 v63, s68, 20
-; VI-NEXT:    v_writelane_b32 v63, s69, 21
-; VI-NEXT:    v_writelane_b32 v63, s70, 22
-; VI-NEXT:    v_writelane_b32 v63, s71, 23
-; VI-NEXT:    v_writelane_b32 v63, s80, 24
-; VI-NEXT:    v_writelane_b32 v63, s81, 25
-; VI-NEXT:    v_writelane_b32 v63, s82, 26
-; VI-NEXT:    v_writelane_b32 v63, s83, 27
-; VI-NEXT:    v_writelane_b32 v63, s84, 28
-; VI-NEXT:    v_writelane_b32 v63, s85, 29
-; VI-NEXT:    v_writelane_b32 v63, s86, 30
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s68, 18
+; VI-NEXT:    v_writelane_b32 v63, s69, 19
+; VI-NEXT:    v_writelane_b32 v63, s70, 20
+; VI-NEXT:    v_writelane_b32 v63, s71, 21
+; VI-NEXT:    v_writelane_b32 v63, s80, 22
+; VI-NEXT:    v_writelane_b32 v63, s81, 23
+; VI-NEXT:    v_writelane_b32 v63, s82, 24
+; VI-NEXT:    v_writelane_b32 v63, s83, 25
+; VI-NEXT:    v_writelane_b32 v63, s84, 26
+; VI-NEXT:    v_writelane_b32 v63, s85, 27
+; VI-NEXT:    v_writelane_b32 v63, s86, 28
+; VI-NEXT:    v_writelane_b32 v63, s87, 29
+; VI-NEXT:    v_writelane_b32 v63, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v63, s87, 31
+; VI-NEXT:    v_writelane_b32 v63, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -111467,38 +111467,38 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v29, v29, v44, s4
 ; VI-NEXT:    v_perm_b32 v27, v27, v33, s4
-; VI-NEXT:    v_readlane_b32 s87, v63, 31
-; VI-NEXT:    v_readlane_b32 s86, v63, 30
-; VI-NEXT:    v_readlane_b32 s85, v63, 29
-; VI-NEXT:    v_readlane_b32 s84, v63, 28
-; VI-NEXT:    v_readlane_b32 s83, v63, 27
-; VI-NEXT:    v_readlane_b32 s82, v63, 26
-; VI-NEXT:    v_readlane_b32 s81, v63, 25
-; VI-NEXT:    v_readlane_b32 s80, v63, 24
-; VI-NEXT:    v_readlane_b32 s71, v63, 23
-; VI-NEXT:    v_readlane_b32 s70, v63, 22
-; VI-NEXT:    v_readlane_b32 s69, v63, 21
-; VI-NEXT:    v_readlane_b32 s68, v63, 20
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 30
+; VI-NEXT:    v_readlane_b32 s31, v63, 31
+; VI-NEXT:    v_readlane_b32 s87, v63, 29
+; VI-NEXT:    v_readlane_b32 s86, v63, 28
+; VI-NEXT:    v_readlane_b32 s85, v63, 27
+; VI-NEXT:    v_readlane_b32 s84, v63, 26
+; VI-NEXT:    v_readlane_b32 s83, v63, 25
+; VI-NEXT:    v_readlane_b32 s82, v63, 24
+; VI-NEXT:    v_readlane_b32 s81, v63, 23
+; VI-NEXT:    v_readlane_b32 s80, v63, 22
+; VI-NEXT:    v_readlane_b32 s71, v63, 21
+; VI-NEXT:    v_readlane_b32 s70, v63, 20
+; VI-NEXT:    v_readlane_b32 s69, v63, 19
+; VI-NEXT:    v_readlane_b32 s68, v63, 18
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v31, v43, v31, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
@@ -111831,43 +111831,43 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v16
@@ -112720,42 +112720,42 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX9-NEXT:    v_or_b32_e32 v26, v39, v26
 ; GFX9-NEXT:    v_perm_b32 v24, v24, v58, s4
 ; GFX9-NEXT:    v_perm_b32 v22, v22, v55, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_perm_b32 v34, v34, v15, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
@@ -113064,33 +113064,33 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v13
-; GFX11-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v10
-; GFX11-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v7
-; GFX11-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v4
-; GFX11-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX11-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -113112,37 +113112,37 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB73_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s11, 16
@@ -113852,47 +113852,47 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -124505,43 +124505,43 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s44, v18
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s5, v17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
@@ -125003,42 +125003,42 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v54
 ; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v52
 ; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v38
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
@@ -125132,17 +125132,17 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -125219,16 +125219,16 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB77_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -125241,17 +125241,17 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -125328,16 +125328,16 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB77_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -128453,15 +128453,17 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s12, s25, 0xffff0000
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
@@ -128539,13 +128541,11 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s47
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_and_b32 s43, s42, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s45
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -129259,12 +129259,12 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -132615,29 +132615,29 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s31, 21
 ; SI-NEXT:    v_readfirstlane_b32 s45, v17
 ; SI-NEXT:    v_readfirstlane_b32 s44, v16
 ; SI-NEXT:    v_readfirstlane_b32 s43, v15
@@ -132967,6 +132967,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; SI-NEXT:    v_readlane_b32 s30, v63, 20
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v55
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v39
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v38
@@ -132974,28 +132975,27 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v36
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v35
 ; SI-NEXT:    v_or_b32_e32 v31, v31, v32
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s31, v63, 21
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v34
@@ -133014,17 +133014,17 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -133101,16 +133101,16 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB81_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -133123,17 +133123,17 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -133210,16 +133210,16 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB81_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -134303,45 +134303,46 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s6, v17
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; SI-NEXT:    v_readfirstlane_b32 s8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr35 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; SI-NEXT:    v_readfirstlane_b32 s10, v15
 ; SI-NEXT:    v_readfirstlane_b32 s12, v14
@@ -134361,7 +134362,6 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s83, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v35, vcc_lo, 0
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_lshr_b32 s69, s29, 16
 ; SI-NEXT:    s_lshr_b32 s71, s28, 16
 ; SI-NEXT:    s_lshr_b32 s82, s27, 16
@@ -134795,42 +134795,42 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB83_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -135828,29 +135828,29 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s30, 20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s31, 21
 ; SI-NEXT:    v_readfirstlane_b32 s45, v17
 ; SI-NEXT:    v_readfirstlane_b32 s44, v16
 ; SI-NEXT:    v_readfirstlane_b32 s43, v15
@@ -136180,6 +136180,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; SI-NEXT:    v_readlane_b32 s30, v63, 20
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v55
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v39
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v38
@@ -136187,28 +136188,27 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v36
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v35
 ; SI-NEXT:    v_or_b32_e32 v31, v31, v32
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s31, v63, 21
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v34
@@ -136227,17 +136227,17 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s36, 2
-; VI-NEXT:    v_writelane_b32 v32, s37, 3
-; VI-NEXT:    v_writelane_b32 v32, s38, 4
-; VI-NEXT:    v_writelane_b32 v32, s39, 5
-; VI-NEXT:    v_writelane_b32 v32, s48, 6
-; VI-NEXT:    v_writelane_b32 v32, s49, 7
-; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s36, 0
+; VI-NEXT:    v_writelane_b32 v32, s37, 1
+; VI-NEXT:    v_writelane_b32 v32, s38, 2
+; VI-NEXT:    v_writelane_b32 v32, s39, 3
+; VI-NEXT:    v_writelane_b32 v32, s48, 4
+; VI-NEXT:    v_writelane_b32 v32, s49, 5
+; VI-NEXT:    v_writelane_b32 v32, s50, 6
+; VI-NEXT:    v_writelane_b32 v32, s51, 7
+; VI-NEXT:    v_writelane_b32 v32, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -136314,16 +136314,16 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB85_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v32, 9
-; VI-NEXT:    v_readlane_b32 s50, v32, 8
-; VI-NEXT:    v_readlane_b32 s49, v32, 7
-; VI-NEXT:    v_readlane_b32 s48, v32, 6
-; VI-NEXT:    v_readlane_b32 s39, v32, 5
-; VI-NEXT:    v_readlane_b32 s38, v32, 4
-; VI-NEXT:    v_readlane_b32 s37, v32, 3
-; VI-NEXT:    v_readlane_b32 s36, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 8
+; VI-NEXT:    v_readlane_b32 s31, v32, 9
+; VI-NEXT:    v_readlane_b32 s51, v32, 7
+; VI-NEXT:    v_readlane_b32 s50, v32, 6
+; VI-NEXT:    v_readlane_b32 s49, v32, 5
+; VI-NEXT:    v_readlane_b32 s48, v32, 4
+; VI-NEXT:    v_readlane_b32 s39, v32, 3
+; VI-NEXT:    v_readlane_b32 s38, v32, 2
+; VI-NEXT:    v_readlane_b32 s37, v32, 1
+; VI-NEXT:    v_readlane_b32 s36, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -136336,17 +136336,17 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -136423,16 +136423,16 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB85_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -137346,45 +137346,46 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_lshr_b32 s14, s9, 16
 ; SI-NEXT:    v_readfirstlane_b32 s13, v14
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v17
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
 ; SI-NEXT:    s_lshr_b32 s72, s13, 16
@@ -137404,7 +137405,6 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s97, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s14, 0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s95, s28, 16
 ; SI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -137709,6 +137709,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s67, s4, 0x30000
 ; SI-NEXT:  .LBB87_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -137741,42 +137742,41 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -144589,31 +144589,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s4, v29
 ; SI-NEXT:    v_writelane_b32 v43, s4, 9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v28
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
 ; SI-NEXT:    v_writelane_b32 v43, s4, 10
 ; SI-NEXT:    v_readfirstlane_b32 s4, v27
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
 ; SI-NEXT:    v_writelane_b32 v43, s4, 11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
 ; SI-NEXT:    v_writelane_b32 v43, s4, 12
 ; SI-NEXT:    v_readfirstlane_b32 s4, v25
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_writelane_b32 v43, s4, 13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v24
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
 ; SI-NEXT:    v_writelane_b32 v43, s4, 14
 ; SI-NEXT:    v_readfirstlane_b32 s4, v23
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
 ; SI-NEXT:    v_writelane_b32 v43, s4, 15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v22
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
 ; SI-NEXT:    v_writelane_b32 v43, s4, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v21
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
 ; SI-NEXT:    v_writelane_b32 v43, s4, 17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v20
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
 ; SI-NEXT:    v_writelane_b32 v43, s4, 18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v19
 ; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:328
@@ -144641,10 +144639,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:332
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:240
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:236
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
 ; SI-NEXT:    s_mov_b32 s37, s18
 ; SI-NEXT:    v_readfirstlane_b32 s18, v18
 ; SI-NEXT:    v_readfirstlane_b32 s38, v17
@@ -144654,10 +144652,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:228
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
 ; SI-NEXT:    s_mov_b32 s73, s19
 ; SI-NEXT:    v_readfirstlane_b32 s55, v14
 ; SI-NEXT:    v_readfirstlane_b32 s34, v13
@@ -144681,32 +144679,32 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
 ; SI-NEXT:    v_writelane_b32 v43, s4, 19
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s53, v6
 ; SI-NEXT:    v_readfirstlane_b32 s76, v5
 ; SI-NEXT:    v_readfirstlane_b32 s77, v4
 ; SI-NEXT:    v_readfirstlane_b32 s48, v3
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
-; SI-NEXT:    s_mov_b32 s6, s20
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s91, v19
 ; SI-NEXT:    v_readfirstlane_b32 s43, v20
@@ -144750,6 +144748,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:120
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:112
+; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT:    s_mov_b32 s6, s20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 21
 ; SI-NEXT:    s_waitcnt vmcnt(14)
@@ -146006,43 +146006,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s83
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s84
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
@@ -156310,46 +156310,49 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s56, v15
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
 ; SI-NEXT:    s_and_b32 s57, s56, 0xffff0000
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s46, v16
 ; SI-NEXT:    s_lshl_b32 s56, s56, 16
 ; SI-NEXT:    v_readfirstlane_b32 s78, v8
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v3
 ; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s57
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s44, v17
 ; SI-NEXT:    s_and_b32 s47, s46, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s46, s46, 16
@@ -156359,7 +156362,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s56
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
 ; SI-NEXT:    v_readfirstlane_b32 s42, v18
 ; SI-NEXT:    s_and_b32 s45, s44, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s44, s44, 16
@@ -156379,7 +156381,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s46
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
@@ -156451,7 +156452,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
 ; SI-NEXT:    s_cmp_lg_u32 vcc_lo, 0
 ; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s41
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s16
@@ -157989,40 +157989,40 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    v_readlane_b32 s27, v62, 59
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
 ; SI-NEXT:    v_readlane_b32 s61, v62, 35
 ; SI-NEXT:    v_readlane_b32 s43, v62, 47
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s8, v1
@@ -158240,10 +158240,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_readlane_b32 s17, v61, 19
 ; SI-NEXT:    v_readlane_b32 s11, v61, 25
 ; SI-NEXT:    v_readlane_b32 s9, v61, 33
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s5, v1
@@ -158277,39 +158277,39 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
-; VI-NEXT:    v_writelane_b32 v63, s68, 20
-; VI-NEXT:    v_writelane_b32 v63, s69, 21
-; VI-NEXT:    v_writelane_b32 v63, s70, 22
-; VI-NEXT:    v_writelane_b32 v63, s71, 23
-; VI-NEXT:    v_writelane_b32 v63, s80, 24
-; VI-NEXT:    v_writelane_b32 v63, s81, 25
-; VI-NEXT:    v_writelane_b32 v63, s82, 26
-; VI-NEXT:    v_writelane_b32 v63, s83, 27
-; VI-NEXT:    v_writelane_b32 v63, s84, 28
-; VI-NEXT:    v_writelane_b32 v63, s85, 29
-; VI-NEXT:    v_writelane_b32 v63, s86, 30
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s68, 18
+; VI-NEXT:    v_writelane_b32 v63, s69, 19
+; VI-NEXT:    v_writelane_b32 v63, s70, 20
+; VI-NEXT:    v_writelane_b32 v63, s71, 21
+; VI-NEXT:    v_writelane_b32 v63, s80, 22
+; VI-NEXT:    v_writelane_b32 v63, s81, 23
+; VI-NEXT:    v_writelane_b32 v63, s82, 24
+; VI-NEXT:    v_writelane_b32 v63, s83, 25
+; VI-NEXT:    v_writelane_b32 v63, s84, 26
+; VI-NEXT:    v_writelane_b32 v63, s85, 27
+; VI-NEXT:    v_writelane_b32 v63, s86, 28
+; VI-NEXT:    v_writelane_b32 v63, s87, 29
+; VI-NEXT:    v_writelane_b32 v63, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s4, v19
-; VI-NEXT:    v_writelane_b32 v63, s87, 31
+; VI-NEXT:    v_writelane_b32 v63, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v17
 ; VI-NEXT:    v_readfirstlane_b32 s9, v16
@@ -159780,38 +159780,38 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v5, v32, v34, s4
-; VI-NEXT:    v_readlane_b32 s87, v63, 31
-; VI-NEXT:    v_readlane_b32 s86, v63, 30
-; VI-NEXT:    v_readlane_b32 s85, v63, 29
-; VI-NEXT:    v_readlane_b32 s84, v63, 28
-; VI-NEXT:    v_readlane_b32 s83, v63, 27
-; VI-NEXT:    v_readlane_b32 s82, v63, 26
-; VI-NEXT:    v_readlane_b32 s81, v63, 25
-; VI-NEXT:    v_readlane_b32 s80, v63, 24
-; VI-NEXT:    v_readlane_b32 s71, v63, 23
-; VI-NEXT:    v_readlane_b32 s70, v63, 22
-; VI-NEXT:    v_readlane_b32 s69, v63, 21
-; VI-NEXT:    v_readlane_b32 s68, v63, 20
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 30
+; VI-NEXT:    v_readlane_b32 s31, v63, 31
+; VI-NEXT:    v_readlane_b32 s87, v63, 29
+; VI-NEXT:    v_readlane_b32 s86, v63, 28
+; VI-NEXT:    v_readlane_b32 s85, v63, 27
+; VI-NEXT:    v_readlane_b32 s84, v63, 26
+; VI-NEXT:    v_readlane_b32 s83, v63, 25
+; VI-NEXT:    v_readlane_b32 s82, v63, 24
+; VI-NEXT:    v_readlane_b32 s81, v63, 23
+; VI-NEXT:    v_readlane_b32 s80, v63, 22
+; VI-NEXT:    v_readlane_b32 s71, v63, 21
+; VI-NEXT:    v_readlane_b32 s70, v63, 20
+; VI-NEXT:    v_readlane_b32 s69, v63, 19
+; VI-NEXT:    v_readlane_b32 s68, v63, 18
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v4, v2, v4, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -160135,43 +160135,43 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v16
@@ -161553,42 +161553,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_perm_b32 v12, v12, v37, s4
 ; GFX9-NEXT:    v_perm_b32 v36, v57, v36, s4
 ; GFX9-NEXT:    v_perm_b32 v30, v58, v30, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v50, v23, v50, s4
 ; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
@@ -161916,33 +161916,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-TRUE16-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -161964,37 +161964,37 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB91_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s27, 24
@@ -163288,47 +163288,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-TRUE16-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -163349,33 +163349,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-FAKE16-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-FAKE16-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -163397,37 +163397,37 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB91_3
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 24
@@ -164739,47 +164739,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-FAKE16-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -171060,72 +171060,70 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v43, s29, 0
 ; SI-NEXT:    v_writelane_b32 v43, s28, 1
 ; SI-NEXT:    v_writelane_b32 v43, s27, 2
 ; SI-NEXT:    v_writelane_b32 v43, s26, 3
 ; SI-NEXT:    v_writelane_b32 v43, s25, 4
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
 ; SI-NEXT:    v_writelane_b32 v43, s24, 5
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
 ; SI-NEXT:    v_writelane_b32 v43, s23, 6
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_writelane_b32 v43, s22, 7
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
 ; SI-NEXT:    v_writelane_b32 v43, s21, 8
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
 ; SI-NEXT:    v_writelane_b32 v43, s20, 9
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
 ; SI-NEXT:    v_writelane_b32 v43, s19, 10
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
 ; SI-NEXT:    v_writelane_b32 v43, s18, 11
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
 ; SI-NEXT:    v_writelane_b32 v43, s17, 12
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
 ; SI-NEXT:    v_writelane_b32 v43, s16, 13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v30
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
 ; SI-NEXT:    v_writelane_b32 v43, s4, 14
 ; SI-NEXT:    v_readfirstlane_b32 s4, v29
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
 ; SI-NEXT:    v_writelane_b32 v43, s4, 15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v28
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
 ; SI-NEXT:    v_writelane_b32 v43, s4, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v27
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
 ; SI-NEXT:    v_writelane_b32 v43, s4, 17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
 ; SI-NEXT:    v_writelane_b32 v43, s4, 18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v25
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
 ; SI-NEXT:    v_writelane_b32 v43, s4, 19
 ; SI-NEXT:    v_readfirstlane_b32 s4, v24
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
 ; SI-NEXT:    v_writelane_b32 v43, s4, 20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v23
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
 ; SI-NEXT:    v_writelane_b32 v43, s4, 21
 ; SI-NEXT:    v_readfirstlane_b32 s4, v22
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
 ; SI-NEXT:    v_writelane_b32 v43, s4, 22
 ; SI-NEXT:    v_readfirstlane_b32 s4, v21
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
 ; SI-NEXT:    v_writelane_b32 v43, s4, 23
 ; SI-NEXT:    v_readfirstlane_b32 s4, v20
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 24
 ; SI-NEXT:    v_readfirstlane_b32 s4, v19
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
 ; SI-NEXT:    v_writelane_b32 v43, s4, 25
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
 ; SI-NEXT:    v_writelane_b32 v43, s4, 26
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
 ; SI-NEXT:    v_readfirstlane_b32 s71, v18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v15
@@ -171193,21 +171191,23 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:188
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
 ; SI-NEXT:    v_writelane_b32 v43, s4, 38
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s28, v3
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s56, v19
 ; SI-NEXT:    v_readfirstlane_b32 s47, v20
@@ -172521,6 +172521,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s16, s68, 16
 ; SI-NEXT:    s_or_b32 s4, s4, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s16
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s17
 ; SI-NEXT:    v_mov_b32_e32 v1, s18
 ; SI-NEXT:    v_mov_b32_e32 v2, s19
@@ -172553,42 +172554,41 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
@@ -180344,60 +180344,61 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v34, s30, 0
-; SI-NEXT:    v_writelane_b32 v34, s31, 1
-; SI-NEXT:    v_writelane_b32 v34, s34, 2
-; SI-NEXT:    v_writelane_b32 v34, s35, 3
-; SI-NEXT:    v_writelane_b32 v34, s36, 4
-; SI-NEXT:    v_writelane_b32 v34, s37, 5
-; SI-NEXT:    v_writelane_b32 v34, s38, 6
-; SI-NEXT:    v_writelane_b32 v34, s39, 7
-; SI-NEXT:    v_writelane_b32 v34, s48, 8
-; SI-NEXT:    v_writelane_b32 v34, s49, 9
-; SI-NEXT:    v_writelane_b32 v34, s50, 10
-; SI-NEXT:    v_writelane_b32 v34, s51, 11
-; SI-NEXT:    v_writelane_b32 v34, s52, 12
-; SI-NEXT:    v_writelane_b32 v34, s53, 13
-; SI-NEXT:    v_writelane_b32 v34, s54, 14
-; SI-NEXT:    v_writelane_b32 v34, s55, 15
-; SI-NEXT:    v_writelane_b32 v34, s64, 16
-; SI-NEXT:    v_writelane_b32 v34, s65, 17
-; SI-NEXT:    v_writelane_b32 v34, s66, 18
-; SI-NEXT:    v_writelane_b32 v34, s67, 19
-; SI-NEXT:    v_writelane_b32 v34, s68, 20
-; SI-NEXT:    v_writelane_b32 v34, s69, 21
-; SI-NEXT:    v_writelane_b32 v34, s70, 22
-; SI-NEXT:    v_writelane_b32 v34, s71, 23
-; SI-NEXT:    v_writelane_b32 v34, s80, 24
-; SI-NEXT:    v_writelane_b32 v34, s81, 25
-; SI-NEXT:    v_writelane_b32 v34, s82, 26
+; SI-NEXT:    v_writelane_b32 v34, s34, 0
+; SI-NEXT:    v_writelane_b32 v34, s35, 1
+; SI-NEXT:    v_writelane_b32 v34, s36, 2
+; SI-NEXT:    v_writelane_b32 v34, s37, 3
+; SI-NEXT:    v_writelane_b32 v34, s38, 4
+; SI-NEXT:    v_writelane_b32 v34, s39, 5
+; SI-NEXT:    v_writelane_b32 v34, s48, 6
+; SI-NEXT:    v_writelane_b32 v34, s49, 7
+; SI-NEXT:    v_writelane_b32 v34, s50, 8
+; SI-NEXT:    v_writelane_b32 v34, s51, 9
+; SI-NEXT:    v_writelane_b32 v34, s52, 10
+; SI-NEXT:    v_writelane_b32 v34, s53, 11
+; SI-NEXT:    v_writelane_b32 v34, s54, 12
+; SI-NEXT:    v_writelane_b32 v34, s55, 13
+; SI-NEXT:    v_writelane_b32 v34, s64, 14
+; SI-NEXT:    v_writelane_b32 v34, s65, 15
+; SI-NEXT:    v_writelane_b32 v34, s66, 16
+; SI-NEXT:    v_writelane_b32 v34, s67, 17
+; SI-NEXT:    v_writelane_b32 v34, s68, 18
+; SI-NEXT:    v_writelane_b32 v34, s69, 19
+; SI-NEXT:    v_writelane_b32 v34, s70, 20
+; SI-NEXT:    v_writelane_b32 v34, s71, 21
+; SI-NEXT:    v_writelane_b32 v34, s80, 22
+; SI-NEXT:    v_writelane_b32 v34, s81, 23
+; SI-NEXT:    v_writelane_b32 v34, s82, 24
+; SI-NEXT:    v_writelane_b32 v34, s83, 25
+; SI-NEXT:    v_writelane_b32 v34, s84, 26
+; SI-NEXT:    v_writelane_b32 v34, s85, 27
+; SI-NEXT:    v_writelane_b32 v34, s86, 28
+; SI-NEXT:    v_writelane_b32 v34, s87, 29
 ; SI-NEXT:    s_lshr_b32 s5, s16, 16
 ; SI-NEXT:    ; implicit-def: $vgpr37 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v34, s83, 27
+; SI-NEXT:    v_writelane_b32 v34, s96, 30
 ; SI-NEXT:    s_lshr_b32 s6, s18, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v37, s5, 0
-; SI-NEXT:    v_writelane_b32 v34, s84, 28
+; SI-NEXT:    v_writelane_b32 v34, s97, 31
 ; SI-NEXT:    s_lshr_b32 s7, s20, 16
 ; SI-NEXT:    v_writelane_b32 v37, s6, 2
-; SI-NEXT:    v_writelane_b32 v34, s85, 29
+; SI-NEXT:    v_writelane_b32 v34, s98, 32
 ; SI-NEXT:    s_lshr_b32 s8, s22, 16
 ; SI-NEXT:    v_writelane_b32 v37, s7, 4
-; SI-NEXT:    v_writelane_b32 v34, s86, 30
+; SI-NEXT:    v_writelane_b32 v34, s99, 33
 ; SI-NEXT:    s_lshr_b32 s9, s24, 16
 ; SI-NEXT:    v_writelane_b32 v37, s8, 5
-; SI-NEXT:    v_writelane_b32 v34, s87, 31
+; SI-NEXT:    v_writelane_b32 v34, s30, 34
 ; SI-NEXT:    s_lshr_b32 s10, s26, 16
 ; SI-NEXT:    v_writelane_b32 v37, s9, 6
-; SI-NEXT:    v_writelane_b32 v34, s96, 32
+; SI-NEXT:    v_writelane_b32 v34, s31, 35
 ; SI-NEXT:    s_lshr_b32 s11, s28, 16
 ; SI-NEXT:    v_readfirstlane_b32 s31, v3
 ; SI-NEXT:    v_writelane_b32 v37, s10, 7
-; SI-NEXT:    v_writelane_b32 v34, s97, 33
 ; SI-NEXT:    v_readfirstlane_b32 s37, v5
 ; SI-NEXT:    s_lshr_b32 s12, s31, 16
 ; SI-NEXT:    v_writelane_b32 v37, s11, 8
-; SI-NEXT:    v_writelane_b32 v34, s98, 34
 ; SI-NEXT:    v_readfirstlane_b32 s81, v18
 ; SI-NEXT:    v_readfirstlane_b32 s83, v17
 ; SI-NEXT:    v_readfirstlane_b32 s70, v16
@@ -180416,7 +180417,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s91, v2
 ; SI-NEXT:    v_readfirstlane_b32 s93, v1
 ; SI-NEXT:    v_writelane_b32 v37, s12, 9
-; SI-NEXT:    v_writelane_b32 v34, s99, 35
 ; SI-NEXT:    s_mov_b32 s53, s16
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -181959,6 +181959,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT:    v_readlane_b32 s30, v34, 34
 ; SI-NEXT:    v_readlane_b32 s23, v37, 54
 ; SI-NEXT:    v_readlane_b32 s21, v36, 14
 ; SI-NEXT:    v_readlane_b32 s17, v36, 20
@@ -181966,42 +181967,41 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readlane_b32 s11, v36, 38
 ; SI-NEXT:    v_readlane_b32 s9, v37, 1
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v34, 35
-; SI-NEXT:    v_readlane_b32 s98, v34, 34
-; SI-NEXT:    v_readlane_b32 s97, v34, 33
-; SI-NEXT:    v_readlane_b32 s96, v34, 32
-; SI-NEXT:    v_readlane_b32 s87, v34, 31
-; SI-NEXT:    v_readlane_b32 s86, v34, 30
-; SI-NEXT:    v_readlane_b32 s85, v34, 29
-; SI-NEXT:    v_readlane_b32 s84, v34, 28
-; SI-NEXT:    v_readlane_b32 s83, v34, 27
-; SI-NEXT:    v_readlane_b32 s82, v34, 26
-; SI-NEXT:    v_readlane_b32 s81, v34, 25
-; SI-NEXT:    v_readlane_b32 s80, v34, 24
-; SI-NEXT:    v_readlane_b32 s71, v34, 23
-; SI-NEXT:    v_readlane_b32 s70, v34, 22
-; SI-NEXT:    v_readlane_b32 s69, v34, 21
-; SI-NEXT:    v_readlane_b32 s68, v34, 20
-; SI-NEXT:    v_readlane_b32 s67, v34, 19
-; SI-NEXT:    v_readlane_b32 s66, v34, 18
-; SI-NEXT:    v_readlane_b32 s65, v34, 17
-; SI-NEXT:    v_readlane_b32 s64, v34, 16
-; SI-NEXT:    v_readlane_b32 s55, v34, 15
-; SI-NEXT:    v_readlane_b32 s54, v34, 14
-; SI-NEXT:    v_readlane_b32 s53, v34, 13
-; SI-NEXT:    v_readlane_b32 s52, v34, 12
-; SI-NEXT:    v_readlane_b32 s51, v34, 11
-; SI-NEXT:    v_readlane_b32 s50, v34, 10
-; SI-NEXT:    v_readlane_b32 s49, v34, 9
-; SI-NEXT:    v_readlane_b32 s48, v34, 8
-; SI-NEXT:    v_readlane_b32 s39, v34, 7
-; SI-NEXT:    v_readlane_b32 s38, v34, 6
-; SI-NEXT:    v_readlane_b32 s37, v34, 5
-; SI-NEXT:    v_readlane_b32 s36, v34, 4
-; SI-NEXT:    v_readlane_b32 s35, v34, 3
-; SI-NEXT:    v_readlane_b32 s34, v34, 2
-; SI-NEXT:    v_readlane_b32 s31, v34, 1
-; SI-NEXT:    v_readlane_b32 s30, v34, 0
+; SI-NEXT:    v_readlane_b32 s31, v34, 35
+; SI-NEXT:    v_readlane_b32 s99, v34, 33
+; SI-NEXT:    v_readlane_b32 s98, v34, 32
+; SI-NEXT:    v_readlane_b32 s97, v34, 31
+; SI-NEXT:    v_readlane_b32 s96, v34, 30
+; SI-NEXT:    v_readlane_b32 s87, v34, 29
+; SI-NEXT:    v_readlane_b32 s86, v34, 28
+; SI-NEXT:    v_readlane_b32 s85, v34, 27
+; SI-NEXT:    v_readlane_b32 s84, v34, 26
+; SI-NEXT:    v_readlane_b32 s83, v34, 25
+; SI-NEXT:    v_readlane_b32 s82, v34, 24
+; SI-NEXT:    v_readlane_b32 s81, v34, 23
+; SI-NEXT:    v_readlane_b32 s80, v34, 22
+; SI-NEXT:    v_readlane_b32 s71, v34, 21
+; SI-NEXT:    v_readlane_b32 s70, v34, 20
+; SI-NEXT:    v_readlane_b32 s69, v34, 19
+; SI-NEXT:    v_readlane_b32 s68, v34, 18
+; SI-NEXT:    v_readlane_b32 s67, v34, 17
+; SI-NEXT:    v_readlane_b32 s66, v34, 16
+; SI-NEXT:    v_readlane_b32 s65, v34, 15
+; SI-NEXT:    v_readlane_b32 s64, v34, 14
+; SI-NEXT:    v_readlane_b32 s55, v34, 13
+; SI-NEXT:    v_readlane_b32 s54, v34, 12
+; SI-NEXT:    v_readlane_b32 s53, v34, 11
+; SI-NEXT:    v_readlane_b32 s52, v34, 10
+; SI-NEXT:    v_readlane_b32 s51, v34, 9
+; SI-NEXT:    v_readlane_b32 s50, v34, 8
+; SI-NEXT:    v_readlane_b32 s49, v34, 7
+; SI-NEXT:    v_readlane_b32 s48, v34, 6
+; SI-NEXT:    v_readlane_b32 s39, v34, 5
+; SI-NEXT:    v_readlane_b32 s38, v34, 4
+; SI-NEXT:    v_readlane_b32 s37, v34, 3
+; SI-NEXT:    v_readlane_b32 s36, v34, 2
+; SI-NEXT:    v_readlane_b32 s35, v34, 1
+; SI-NEXT:    v_readlane_b32 s34, v34, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -182018,39 +182018,39 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
-; VI-NEXT:    v_writelane_b32 v63, s68, 20
-; VI-NEXT:    v_writelane_b32 v63, s69, 21
-; VI-NEXT:    v_writelane_b32 v63, s70, 22
-; VI-NEXT:    v_writelane_b32 v63, s71, 23
-; VI-NEXT:    v_writelane_b32 v63, s80, 24
-; VI-NEXT:    v_writelane_b32 v63, s81, 25
-; VI-NEXT:    v_writelane_b32 v63, s82, 26
-; VI-NEXT:    v_writelane_b32 v63, s83, 27
-; VI-NEXT:    v_writelane_b32 v63, s84, 28
-; VI-NEXT:    v_writelane_b32 v63, s85, 29
-; VI-NEXT:    v_writelane_b32 v63, s86, 30
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s68, 18
+; VI-NEXT:    v_writelane_b32 v63, s69, 19
+; VI-NEXT:    v_writelane_b32 v63, s70, 20
+; VI-NEXT:    v_writelane_b32 v63, s71, 21
+; VI-NEXT:    v_writelane_b32 v63, s80, 22
+; VI-NEXT:    v_writelane_b32 v63, s81, 23
+; VI-NEXT:    v_writelane_b32 v63, s82, 24
+; VI-NEXT:    v_writelane_b32 v63, s83, 25
+; VI-NEXT:    v_writelane_b32 v63, s84, 26
+; VI-NEXT:    v_writelane_b32 v63, s85, 27
+; VI-NEXT:    v_writelane_b32 v63, s86, 28
+; VI-NEXT:    v_writelane_b32 v63, s87, 29
+; VI-NEXT:    v_writelane_b32 v63, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s4, v19
-; VI-NEXT:    v_writelane_b32 v63, s87, 31
+; VI-NEXT:    v_writelane_b32 v63, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v17
 ; VI-NEXT:    v_readfirstlane_b32 s9, v16
@@ -182971,38 +182971,38 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; VI-NEXT:    v_perm_b32 v9, v47, v9, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; VI-NEXT:    v_readlane_b32 s87, v63, 31
-; VI-NEXT:    v_readlane_b32 s86, v63, 30
-; VI-NEXT:    v_readlane_b32 s85, v63, 29
-; VI-NEXT:    v_readlane_b32 s84, v63, 28
-; VI-NEXT:    v_readlane_b32 s83, v63, 27
-; VI-NEXT:    v_readlane_b32 s82, v63, 26
-; VI-NEXT:    v_readlane_b32 s81, v63, 25
-; VI-NEXT:    v_readlane_b32 s80, v63, 24
-; VI-NEXT:    v_readlane_b32 s71, v63, 23
-; VI-NEXT:    v_readlane_b32 s70, v63, 22
-; VI-NEXT:    v_readlane_b32 s69, v63, 21
-; VI-NEXT:    v_readlane_b32 s68, v63, 20
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 30
+; VI-NEXT:    v_readlane_b32 s31, v63, 31
+; VI-NEXT:    v_readlane_b32 s87, v63, 29
+; VI-NEXT:    v_readlane_b32 s86, v63, 28
+; VI-NEXT:    v_readlane_b32 s85, v63, 27
+; VI-NEXT:    v_readlane_b32 s84, v63, 26
+; VI-NEXT:    v_readlane_b32 s83, v63, 25
+; VI-NEXT:    v_readlane_b32 s82, v63, 24
+; VI-NEXT:    v_readlane_b32 s81, v63, 23
+; VI-NEXT:    v_readlane_b32 s80, v63, 22
+; VI-NEXT:    v_readlane_b32 s71, v63, 21
+; VI-NEXT:    v_readlane_b32 s70, v63, 20
+; VI-NEXT:    v_readlane_b32 s69, v63, 19
+; VI-NEXT:    v_readlane_b32 s68, v63, 18
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v56, v23, v56, s4
 ; VI-NEXT:    v_or_b32_e32 v20, v56, v20
@@ -183327,43 +183327,43 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v16
@@ -184228,42 +184228,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    v_perm_b32 v30, v30, v56, s4
 ; GFX9-NEXT:    v_perm_b32 v27, v27, v46, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v45, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_perm_b32 v36, v58, v52, s4
 ; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
@@ -184564,33 +184564,33 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -184612,37 +184612,37 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB95_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s27, 24
@@ -185365,47 +185365,47 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -191686,72 +191686,70 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v43, s29, 0
 ; SI-NEXT:    v_writelane_b32 v43, s28, 1
 ; SI-NEXT:    v_writelane_b32 v43, s27, 2
 ; SI-NEXT:    v_writelane_b32 v43, s26, 3
 ; SI-NEXT:    v_writelane_b32 v43, s25, 4
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
 ; SI-NEXT:    v_writelane_b32 v43, s24, 5
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
 ; SI-NEXT:    v_writelane_b32 v43, s23, 6
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_writelane_b32 v43, s22, 7
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
 ; SI-NEXT:    v_writelane_b32 v43, s21, 8
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
 ; SI-NEXT:    v_writelane_b32 v43, s20, 9
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
 ; SI-NEXT:    v_writelane_b32 v43, s19, 10
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
 ; SI-NEXT:    v_writelane_b32 v43, s18, 11
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
 ; SI-NEXT:    v_writelane_b32 v43, s17, 12
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
 ; SI-NEXT:    v_writelane_b32 v43, s16, 13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v30
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
 ; SI-NEXT:    v_writelane_b32 v43, s4, 14
 ; SI-NEXT:    v_readfirstlane_b32 s4, v29
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
 ; SI-NEXT:    v_writelane_b32 v43, s4, 15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v28
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
 ; SI-NEXT:    v_writelane_b32 v43, s4, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v27
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
 ; SI-NEXT:    v_writelane_b32 v43, s4, 17
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
 ; SI-NEXT:    v_writelane_b32 v43, s4, 18
 ; SI-NEXT:    v_readfirstlane_b32 s4, v25
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
 ; SI-NEXT:    v_writelane_b32 v43, s4, 19
 ; SI-NEXT:    v_readfirstlane_b32 s4, v24
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
 ; SI-NEXT:    v_writelane_b32 v43, s4, 20
 ; SI-NEXT:    v_readfirstlane_b32 s4, v23
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
 ; SI-NEXT:    v_writelane_b32 v43, s4, 21
 ; SI-NEXT:    v_readfirstlane_b32 s4, v22
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
 ; SI-NEXT:    v_writelane_b32 v43, s4, 22
 ; SI-NEXT:    v_readfirstlane_b32 s4, v21
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
 ; SI-NEXT:    v_writelane_b32 v43, s4, 23
 ; SI-NEXT:    v_readfirstlane_b32 s4, v20
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 24
 ; SI-NEXT:    v_readfirstlane_b32 s4, v19
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
 ; SI-NEXT:    v_writelane_b32 v43, s4, 25
 ; SI-NEXT:    v_readfirstlane_b32 s4, v17
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
 ; SI-NEXT:    v_writelane_b32 v43, s4, 26
 ; SI-NEXT:    v_readfirstlane_b32 s4, v16
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
 ; SI-NEXT:    v_readfirstlane_b32 s71, v18
 ; SI-NEXT:    v_writelane_b32 v43, s4, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v15
@@ -191819,21 +191817,23 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:188
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
 ; SI-NEXT:    v_writelane_b32 v43, s4, 38
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s28, v3
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s56, v19
 ; SI-NEXT:    v_readfirstlane_b32 s47, v20
@@ -193147,6 +193147,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s16, s68, 16
 ; SI-NEXT:    s_or_b32 s4, s4, s47
 ; SI-NEXT:    s_or_b32 s5, s5, s16
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s17
 ; SI-NEXT:    v_mov_b32_e32 v1, s18
 ; SI-NEXT:    v_mov_b32_e32 v2, s19
@@ -193179,42 +193180,41 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
@@ -201087,61 +201087,61 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_writelane_b32 v20, s30, 0
-; SI-NEXT:    v_writelane_b32 v20, s31, 1
-; SI-NEXT:    v_writelane_b32 v20, s34, 2
-; SI-NEXT:    v_writelane_b32 v20, s35, 3
-; SI-NEXT:    v_writelane_b32 v20, s36, 4
-; SI-NEXT:    v_writelane_b32 v20, s37, 5
-; SI-NEXT:    v_writelane_b32 v20, s38, 6
-; SI-NEXT:    v_writelane_b32 v20, s39, 7
-; SI-NEXT:    v_writelane_b32 v20, s48, 8
-; SI-NEXT:    v_writelane_b32 v20, s49, 9
-; SI-NEXT:    v_writelane_b32 v20, s50, 10
-; SI-NEXT:    v_writelane_b32 v20, s51, 11
-; SI-NEXT:    v_writelane_b32 v20, s52, 12
-; SI-NEXT:    v_writelane_b32 v20, s53, 13
-; SI-NEXT:    v_writelane_b32 v20, s54, 14
-; SI-NEXT:    v_writelane_b32 v20, s55, 15
-; SI-NEXT:    v_writelane_b32 v20, s64, 16
-; SI-NEXT:    v_writelane_b32 v20, s65, 17
-; SI-NEXT:    v_writelane_b32 v20, s66, 18
-; SI-NEXT:    v_writelane_b32 v20, s67, 19
-; SI-NEXT:    v_writelane_b32 v20, s68, 20
-; SI-NEXT:    v_writelane_b32 v20, s69, 21
-; SI-NEXT:    v_writelane_b32 v20, s70, 22
-; SI-NEXT:    v_writelane_b32 v20, s71, 23
-; SI-NEXT:    v_writelane_b32 v20, s80, 24
-; SI-NEXT:    v_writelane_b32 v20, s81, 25
-; SI-NEXT:    v_writelane_b32 v20, s82, 26
-; SI-NEXT:    v_writelane_b32 v20, s83, 27
+; SI-NEXT:    v_writelane_b32 v20, s34, 0
+; SI-NEXT:    v_writelane_b32 v20, s35, 1
+; SI-NEXT:    v_writelane_b32 v20, s36, 2
+; SI-NEXT:    v_writelane_b32 v20, s37, 3
+; SI-NEXT:    v_writelane_b32 v20, s38, 4
+; SI-NEXT:    v_writelane_b32 v20, s39, 5
+; SI-NEXT:    v_writelane_b32 v20, s48, 6
+; SI-NEXT:    v_writelane_b32 v20, s49, 7
+; SI-NEXT:    v_writelane_b32 v20, s50, 8
+; SI-NEXT:    v_writelane_b32 v20, s51, 9
+; SI-NEXT:    v_writelane_b32 v20, s52, 10
+; SI-NEXT:    v_writelane_b32 v20, s53, 11
+; SI-NEXT:    v_writelane_b32 v20, s54, 12
+; SI-NEXT:    v_writelane_b32 v20, s55, 13
+; SI-NEXT:    v_writelane_b32 v20, s64, 14
+; SI-NEXT:    v_writelane_b32 v20, s65, 15
+; SI-NEXT:    v_writelane_b32 v20, s66, 16
+; SI-NEXT:    v_writelane_b32 v20, s67, 17
+; SI-NEXT:    v_writelane_b32 v20, s68, 18
+; SI-NEXT:    v_writelane_b32 v20, s69, 19
+; SI-NEXT:    v_writelane_b32 v20, s70, 20
+; SI-NEXT:    v_writelane_b32 v20, s71, 21
+; SI-NEXT:    v_writelane_b32 v20, s80, 22
+; SI-NEXT:    v_writelane_b32 v20, s81, 23
+; SI-NEXT:    v_writelane_b32 v20, s82, 24
+; SI-NEXT:    v_writelane_b32 v20, s83, 25
+; SI-NEXT:    v_writelane_b32 v20, s84, 26
+; SI-NEXT:    v_writelane_b32 v20, s85, 27
 ; SI-NEXT:    v_readfirstlane_b32 s37, v1
-; SI-NEXT:    v_writelane_b32 v20, s84, 28
+; SI-NEXT:    v_writelane_b32 v20, s86, 28
 ; SI-NEXT:    v_readfirstlane_b32 s39, v3
 ; SI-NEXT:    s_lshr_b32 s6, s37, 16
 ; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v20, s85, 29
+; SI-NEXT:    v_writelane_b32 v20, s87, 29
 ; SI-NEXT:    v_readfirstlane_b32 s49, v5
 ; SI-NEXT:    s_lshr_b32 s7, s39, 16
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_writelane_b32 v21, s6, 0
-; SI-NEXT:    v_writelane_b32 v20, s86, 30
+; SI-NEXT:    v_writelane_b32 v20, s96, 30
 ; SI-NEXT:    v_readfirstlane_b32 s51, v7
 ; SI-NEXT:    s_lshr_b32 s8, s49, 16
 ; SI-NEXT:    v_writelane_b32 v21, s7, 1
-; SI-NEXT:    v_writelane_b32 v20, s87, 31
+; SI-NEXT:    v_writelane_b32 v20, s97, 31
 ; SI-NEXT:    v_readfirstlane_b32 s53, v9
 ; SI-NEXT:    s_lshr_b32 s9, s51, 16
 ; SI-NEXT:    v_writelane_b32 v21, s8, 2
-; SI-NEXT:    v_writelane_b32 v20, s96, 32
+; SI-NEXT:    v_writelane_b32 v20, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s55, v11
 ; SI-NEXT:    s_lshr_b32 s10, s53, 16
 ; SI-NEXT:    v_writelane_b32 v21, s9, 3
-; SI-NEXT:    v_writelane_b32 v20, s97, 33
+; SI-NEXT:    v_writelane_b32 v20, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s65, v13
 ; SI-NEXT:    s_lshr_b32 s11, s55, 16
 ; SI-NEXT:    v_writelane_b32 v21, s10, 4
-; SI-NEXT:    v_writelane_b32 v20, s98, 34
+; SI-NEXT:    v_writelane_b32 v20, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s68, v18
 ; SI-NEXT:    v_readfirstlane_b32 s69, v17
 ; SI-NEXT:    v_readfirstlane_b32 s66, v16
@@ -201155,7 +201155,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s38, v4
 ; SI-NEXT:    v_readfirstlane_b32 s36, v2
 ; SI-NEXT:    v_writelane_b32 v21, s11, 5
-; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    v_writelane_b32 v20, s31, 35
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s99, s28, 16
 ; SI-NEXT:    s_lshr_b32 s93, s27, 16
@@ -202381,6 +202381,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v20, 34
 ; SI-NEXT:    v_readlane_b32 s19, v21, 62
 ; SI-NEXT:    v_readlane_b32 s17, v22, 4
 ; SI-NEXT:    v_readlane_b32 s15, v22, 10
@@ -202388,42 +202389,41 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readlane_b32 s11, v22, 22
 ; SI-NEXT:    v_readlane_b32 s9, v22, 26
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v20, 35
-; SI-NEXT:    v_readlane_b32 s98, v20, 34
-; SI-NEXT:    v_readlane_b32 s97, v20, 33
-; SI-NEXT:    v_readlane_b32 s96, v20, 32
-; SI-NEXT:    v_readlane_b32 s87, v20, 31
-; SI-NEXT:    v_readlane_b32 s86, v20, 30
-; SI-NEXT:    v_readlane_b32 s85, v20, 29
-; SI-NEXT:    v_readlane_b32 s84, v20, 28
-; SI-NEXT:    v_readlane_b32 s83, v20, 27
-; SI-NEXT:    v_readlane_b32 s82, v20, 26
-; SI-NEXT:    v_readlane_b32 s81, v20, 25
-; SI-NEXT:    v_readlane_b32 s80, v20, 24
-; SI-NEXT:    v_readlane_b32 s71, v20, 23
-; SI-NEXT:    v_readlane_b32 s70, v20, 22
-; SI-NEXT:    v_readlane_b32 s69, v20, 21
-; SI-NEXT:    v_readlane_b32 s68, v20, 20
-; SI-NEXT:    v_readlane_b32 s67, v20, 19
-; SI-NEXT:    v_readlane_b32 s66, v20, 18
-; SI-NEXT:    v_readlane_b32 s65, v20, 17
-; SI-NEXT:    v_readlane_b32 s64, v20, 16
-; SI-NEXT:    v_readlane_b32 s55, v20, 15
-; SI-NEXT:    v_readlane_b32 s54, v20, 14
-; SI-NEXT:    v_readlane_b32 s53, v20, 13
-; SI-NEXT:    v_readlane_b32 s52, v20, 12
-; SI-NEXT:    v_readlane_b32 s51, v20, 11
-; SI-NEXT:    v_readlane_b32 s50, v20, 10
-; SI-NEXT:    v_readlane_b32 s49, v20, 9
-; SI-NEXT:    v_readlane_b32 s48, v20, 8
-; SI-NEXT:    v_readlane_b32 s39, v20, 7
-; SI-NEXT:    v_readlane_b32 s38, v20, 6
-; SI-NEXT:    v_readlane_b32 s37, v20, 5
-; SI-NEXT:    v_readlane_b32 s36, v20, 4
-; SI-NEXT:    v_readlane_b32 s35, v20, 3
-; SI-NEXT:    v_readlane_b32 s34, v20, 2
-; SI-NEXT:    v_readlane_b32 s31, v20, 1
-; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    v_readlane_b32 s31, v20, 35
+; SI-NEXT:    v_readlane_b32 s99, v20, 33
+; SI-NEXT:    v_readlane_b32 s98, v20, 32
+; SI-NEXT:    v_readlane_b32 s97, v20, 31
+; SI-NEXT:    v_readlane_b32 s96, v20, 30
+; SI-NEXT:    v_readlane_b32 s87, v20, 29
+; SI-NEXT:    v_readlane_b32 s86, v20, 28
+; SI-NEXT:    v_readlane_b32 s85, v20, 27
+; SI-NEXT:    v_readlane_b32 s84, v20, 26
+; SI-NEXT:    v_readlane_b32 s83, v20, 25
+; SI-NEXT:    v_readlane_b32 s82, v20, 24
+; SI-NEXT:    v_readlane_b32 s81, v20, 23
+; SI-NEXT:    v_readlane_b32 s80, v20, 22
+; SI-NEXT:    v_readlane_b32 s71, v20, 21
+; SI-NEXT:    v_readlane_b32 s70, v20, 20
+; SI-NEXT:    v_readlane_b32 s69, v20, 19
+; SI-NEXT:    v_readlane_b32 s68, v20, 18
+; SI-NEXT:    v_readlane_b32 s67, v20, 17
+; SI-NEXT:    v_readlane_b32 s66, v20, 16
+; SI-NEXT:    v_readlane_b32 s65, v20, 15
+; SI-NEXT:    v_readlane_b32 s64, v20, 14
+; SI-NEXT:    v_readlane_b32 s55, v20, 13
+; SI-NEXT:    v_readlane_b32 s54, v20, 12
+; SI-NEXT:    v_readlane_b32 s53, v20, 11
+; SI-NEXT:    v_readlane_b32 s52, v20, 10
+; SI-NEXT:    v_readlane_b32 s51, v20, 9
+; SI-NEXT:    v_readlane_b32 s50, v20, 8
+; SI-NEXT:    v_readlane_b32 s49, v20, 7
+; SI-NEXT:    v_readlane_b32 s48, v20, 6
+; SI-NEXT:    v_readlane_b32 s39, v20, 5
+; SI-NEXT:    v_readlane_b32 s38, v20, 4
+; SI-NEXT:    v_readlane_b32 s37, v20, 3
+; SI-NEXT:    v_readlane_b32 s36, v20, 2
+; SI-NEXT:    v_readlane_b32 s35, v20, 1
+; SI-NEXT:    v_readlane_b32 s34, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -202652,38 +202652,38 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    v_readfirstlane_b32 s44, v19
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s5, v18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v17
 ; VI-NEXT:    v_readfirstlane_b32 s7, v16
@@ -202703,7 +202703,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s45, v2
 ; VI-NEXT:    s_cmp_lg_u32 s44, 0
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
 ; VI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -203491,39 +203491,39 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -203694,43 +203694,43 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v63, s64, 16
-; GFX9-NEXT:    v_writelane_b32 v63, s65, 17
-; GFX9-NEXT:    v_writelane_b32 v63, s66, 18
-; GFX9-NEXT:    v_writelane_b32 v63, s67, 19
-; GFX9-NEXT:    v_writelane_b32 v63, s68, 20
-; GFX9-NEXT:    v_writelane_b32 v63, s69, 21
-; GFX9-NEXT:    v_writelane_b32 v63, s70, 22
-; GFX9-NEXT:    v_writelane_b32 v63, s71, 23
-; GFX9-NEXT:    v_writelane_b32 v63, s80, 24
-; GFX9-NEXT:    v_writelane_b32 v63, s81, 25
-; GFX9-NEXT:    v_writelane_b32 v63, s82, 26
-; GFX9-NEXT:    v_writelane_b32 v63, s83, 27
-; GFX9-NEXT:    v_writelane_b32 v63, s84, 28
-; GFX9-NEXT:    v_writelane_b32 v63, s85, 29
-; GFX9-NEXT:    v_writelane_b32 v63, s86, 30
-; GFX9-NEXT:    v_writelane_b32 v63, s87, 31
-; GFX9-NEXT:    v_writelane_b32 v63, s96, 32
-; GFX9-NEXT:    v_writelane_b32 v63, s97, 33
-; GFX9-NEXT:    v_writelane_b32 v63, s98, 34
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s66, 16
+; GFX9-NEXT:    v_writelane_b32 v63, s67, 17
+; GFX9-NEXT:    v_writelane_b32 v63, s68, 18
+; GFX9-NEXT:    v_writelane_b32 v63, s69, 19
+; GFX9-NEXT:    v_writelane_b32 v63, s70, 20
+; GFX9-NEXT:    v_writelane_b32 v63, s71, 21
+; GFX9-NEXT:    v_writelane_b32 v63, s80, 22
+; GFX9-NEXT:    v_writelane_b32 v63, s81, 23
+; GFX9-NEXT:    v_writelane_b32 v63, s82, 24
+; GFX9-NEXT:    v_writelane_b32 v63, s83, 25
+; GFX9-NEXT:    v_writelane_b32 v63, s84, 26
+; GFX9-NEXT:    v_writelane_b32 v63, s85, 27
+; GFX9-NEXT:    v_writelane_b32 v63, s86, 28
+; GFX9-NEXT:    v_writelane_b32 v63, s87, 29
+; GFX9-NEXT:    v_writelane_b32 v63, s96, 30
+; GFX9-NEXT:    v_writelane_b32 v63, s97, 31
+; GFX9-NEXT:    v_writelane_b32 v63, s98, 32
+; GFX9-NEXT:    v_writelane_b32 v63, s99, 33
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 34
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v19
-; GFX9-NEXT:    v_writelane_b32 v63, s99, 35
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 35
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v16
@@ -204594,42 +204594,42 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    v_perm_b32 v30, v30, v56, s4
 ; GFX9-NEXT:    v_perm_b32 v27, v27, v46, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v45, s4
-; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
-; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
-; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
-; GFX9-NEXT:    v_readlane_b32 s96, v63, 32
-; GFX9-NEXT:    v_readlane_b32 s87, v63, 31
-; GFX9-NEXT:    v_readlane_b32 s86, v63, 30
-; GFX9-NEXT:    v_readlane_b32 s85, v63, 29
-; GFX9-NEXT:    v_readlane_b32 s84, v63, 28
-; GFX9-NEXT:    v_readlane_b32 s83, v63, 27
-; GFX9-NEXT:    v_readlane_b32 s82, v63, 26
-; GFX9-NEXT:    v_readlane_b32 s81, v63, 25
-; GFX9-NEXT:    v_readlane_b32 s80, v63, 24
-; GFX9-NEXT:    v_readlane_b32 s71, v63, 23
-; GFX9-NEXT:    v_readlane_b32 s70, v63, 22
-; GFX9-NEXT:    v_readlane_b32 s69, v63, 21
-; GFX9-NEXT:    v_readlane_b32 s68, v63, 20
-; GFX9-NEXT:    v_readlane_b32 s67, v63, 19
-; GFX9-NEXT:    v_readlane_b32 s66, v63, 18
-; GFX9-NEXT:    v_readlane_b32 s65, v63, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v63, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 34
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 35
+; GFX9-NEXT:    v_readlane_b32 s99, v63, 33
+; GFX9-NEXT:    v_readlane_b32 s98, v63, 32
+; GFX9-NEXT:    v_readlane_b32 s97, v63, 31
+; GFX9-NEXT:    v_readlane_b32 s96, v63, 30
+; GFX9-NEXT:    v_readlane_b32 s87, v63, 29
+; GFX9-NEXT:    v_readlane_b32 s86, v63, 28
+; GFX9-NEXT:    v_readlane_b32 s85, v63, 27
+; GFX9-NEXT:    v_readlane_b32 s84, v63, 26
+; GFX9-NEXT:    v_readlane_b32 s83, v63, 25
+; GFX9-NEXT:    v_readlane_b32 s82, v63, 24
+; GFX9-NEXT:    v_readlane_b32 s81, v63, 23
+; GFX9-NEXT:    v_readlane_b32 s80, v63, 22
+; GFX9-NEXT:    v_readlane_b32 s71, v63, 21
+; GFX9-NEXT:    v_readlane_b32 s70, v63, 20
+; GFX9-NEXT:    v_readlane_b32 s69, v63, 19
+; GFX9-NEXT:    v_readlane_b32 s68, v63, 18
+; GFX9-NEXT:    v_readlane_b32 s67, v63, 17
+; GFX9-NEXT:    v_readlane_b32 s66, v63, 16
+; GFX9-NEXT:    v_readlane_b32 s65, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_perm_b32 v36, v58, v52, s4
 ; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
@@ -204930,33 +204930,33 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:84
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v74, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v75, s96, 0
+; GFX11-NEXT:    v_writelane_b32 v74, s34, 0
+; GFX11-NEXT:    v_writelane_b32 v75, s98, 0
 ; GFX11-NEXT:    v_readfirstlane_b32 s42, v15
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-NEXT:    v_writelane_b32 v74, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v75, s97, 1
+; GFX11-NEXT:    v_writelane_b32 v74, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v75, s99, 1
 ; GFX11-NEXT:    v_readfirstlane_b32 s7, v12
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v11
 ; GFX11-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-NEXT:    v_writelane_b32 v74, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v75, s98, 2
+; GFX11-NEXT:    v_writelane_b32 v74, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v75, s100, 2
 ; GFX11-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX11-NEXT:    v_readfirstlane_b32 s11, v8
 ; GFX11-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX11-NEXT:    v_writelane_b32 v74, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v75, s99, 3
+; GFX11-NEXT:    v_writelane_b32 v74, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v75, s101, 3
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v6
 ; GFX11-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s15, v4
-; GFX11-NEXT:    v_writelane_b32 v74, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v75, s100, 4
+; GFX11-NEXT:    v_writelane_b32 v74, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v75, s102, 4
 ; GFX11-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX11-NEXT:    v_readfirstlane_b32 s41, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s40, v1
-; GFX11-NEXT:    v_writelane_b32 v74, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v75, s101, 5
+; GFX11-NEXT:    v_writelane_b32 v74, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v75, s103, 5
 ; GFX11-NEXT:    s_cmp_lg_u32 s42, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_clause 0x11 ; 72-byte Folded Spill
@@ -204978,37 +204978,37 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v73, s32
-; GFX11-NEXT:    v_writelane_b32 v74, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v75, s102, 6
+; GFX11-NEXT:    v_writelane_b32 v74, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v75, s104, 6
 ; GFX11-NEXT:    ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
 ; GFX11-NEXT:    ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT:    v_writelane_b32 v74, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v75, s103, 7
-; GFX11-NEXT:    v_writelane_b32 v74, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v75, s104, 8
-; GFX11-NEXT:    v_writelane_b32 v74, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v74, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v74, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v74, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v74, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v74, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v74, s55, 15
-; GFX11-NEXT:    v_writelane_b32 v74, s64, 16
-; GFX11-NEXT:    v_writelane_b32 v74, s65, 17
-; GFX11-NEXT:    v_writelane_b32 v74, s66, 18
-; GFX11-NEXT:    v_writelane_b32 v74, s67, 19
-; GFX11-NEXT:    v_writelane_b32 v74, s68, 20
-; GFX11-NEXT:    v_writelane_b32 v74, s69, 21
-; GFX11-NEXT:    v_writelane_b32 v74, s70, 22
-; GFX11-NEXT:    v_writelane_b32 v74, s71, 23
-; GFX11-NEXT:    v_writelane_b32 v74, s80, 24
-; GFX11-NEXT:    v_writelane_b32 v74, s81, 25
-; GFX11-NEXT:    v_writelane_b32 v74, s82, 26
-; GFX11-NEXT:    v_writelane_b32 v74, s83, 27
-; GFX11-NEXT:    v_writelane_b32 v74, s84, 28
-; GFX11-NEXT:    v_writelane_b32 v74, s85, 29
-; GFX11-NEXT:    v_writelane_b32 v74, s86, 30
-; GFX11-NEXT:    v_writelane_b32 v74, s87, 31
+; GFX11-NEXT:    v_writelane_b32 v74, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v75, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v74, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v75, s31, 8
+; GFX11-NEXT:    v_writelane_b32 v74, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v74, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v74, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v74, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v74, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v74, s64, 14
+; GFX11-NEXT:    v_writelane_b32 v74, s65, 15
+; GFX11-NEXT:    v_writelane_b32 v74, s66, 16
+; GFX11-NEXT:    v_writelane_b32 v74, s67, 17
+; GFX11-NEXT:    v_writelane_b32 v74, s68, 18
+; GFX11-NEXT:    v_writelane_b32 v74, s69, 19
+; GFX11-NEXT:    v_writelane_b32 v74, s70, 20
+; GFX11-NEXT:    v_writelane_b32 v74, s71, 21
+; GFX11-NEXT:    v_writelane_b32 v74, s80, 22
+; GFX11-NEXT:    v_writelane_b32 v74, s81, 23
+; GFX11-NEXT:    v_writelane_b32 v74, s82, 24
+; GFX11-NEXT:    v_writelane_b32 v74, s83, 25
+; GFX11-NEXT:    v_writelane_b32 v74, s84, 26
+; GFX11-NEXT:    v_writelane_b32 v74, s85, 27
+; GFX11-NEXT:    v_writelane_b32 v74, s86, 28
+; GFX11-NEXT:    v_writelane_b32 v74, s87, 29
+; GFX11-NEXT:    v_writelane_b32 v74, s96, 30
+; GFX11-NEXT:    v_writelane_b32 v74, s97, 31
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB99_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s27, 24
@@ -205731,47 +205731,47 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT:    v_readlane_b32 s104, v75, 8
-; GFX11-NEXT:    v_readlane_b32 s103, v75, 7
-; GFX11-NEXT:    v_readlane_b32 s102, v75, 6
-; GFX11-NEXT:    v_readlane_b32 s101, v75, 5
-; GFX11-NEXT:    v_readlane_b32 s100, v75, 4
-; GFX11-NEXT:    v_readlane_b32 s99, v75, 3
-; GFX11-NEXT:    v_readlane_b32 s98, v75, 2
-; GFX11-NEXT:    v_readlane_b32 s97, v75, 1
-; GFX11-NEXT:    v_readlane_b32 s96, v75, 0
-; GFX11-NEXT:    v_readlane_b32 s87, v74, 31
-; GFX11-NEXT:    v_readlane_b32 s86, v74, 30
-; GFX11-NEXT:    v_readlane_b32 s85, v74, 29
-; GFX11-NEXT:    v_readlane_b32 s84, v74, 28
-; GFX11-NEXT:    v_readlane_b32 s83, v74, 27
-; GFX11-NEXT:    v_readlane_b32 s82, v74, 26
-; GFX11-NEXT:    v_readlane_b32 s81, v74, 25
-; GFX11-NEXT:    v_readlane_b32 s80, v74, 24
-; GFX11-NEXT:    v_readlane_b32 s71, v74, 23
-; GFX11-NEXT:    v_readlane_b32 s70, v74, 22
-; GFX11-NEXT:    v_readlane_b32 s69, v74, 21
-; GFX11-NEXT:    v_readlane_b32 s68, v74, 20
-; GFX11-NEXT:    v_readlane_b32 s67, v74, 19
-; GFX11-NEXT:    v_readlane_b32 s66, v74, 18
-; GFX11-NEXT:    v_readlane_b32 s65, v74, 17
-; GFX11-NEXT:    v_readlane_b32 s64, v74, 16
-; GFX11-NEXT:    v_readlane_b32 s55, v74, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v74, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v74, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v74, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v74, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v74, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v74, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v74, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v74, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v74, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v74, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v74, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v74, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v74, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v74, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v74, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v75, 7
+; GFX11-NEXT:    v_readlane_b32 s31, v75, 8
+; GFX11-NEXT:    v_readlane_b32 s104, v75, 6
+; GFX11-NEXT:    v_readlane_b32 s103, v75, 5
+; GFX11-NEXT:    v_readlane_b32 s102, v75, 4
+; GFX11-NEXT:    v_readlane_b32 s101, v75, 3
+; GFX11-NEXT:    v_readlane_b32 s100, v75, 2
+; GFX11-NEXT:    v_readlane_b32 s99, v75, 1
+; GFX11-NEXT:    v_readlane_b32 s98, v75, 0
+; GFX11-NEXT:    v_readlane_b32 s97, v74, 31
+; GFX11-NEXT:    v_readlane_b32 s96, v74, 30
+; GFX11-NEXT:    v_readlane_b32 s87, v74, 29
+; GFX11-NEXT:    v_readlane_b32 s86, v74, 28
+; GFX11-NEXT:    v_readlane_b32 s85, v74, 27
+; GFX11-NEXT:    v_readlane_b32 s84, v74, 26
+; GFX11-NEXT:    v_readlane_b32 s83, v74, 25
+; GFX11-NEXT:    v_readlane_b32 s82, v74, 24
+; GFX11-NEXT:    v_readlane_b32 s81, v74, 23
+; GFX11-NEXT:    v_readlane_b32 s80, v74, 22
+; GFX11-NEXT:    v_readlane_b32 s71, v74, 21
+; GFX11-NEXT:    v_readlane_b32 s70, v74, 20
+; GFX11-NEXT:    v_readlane_b32 s69, v74, 19
+; GFX11-NEXT:    v_readlane_b32 s68, v74, 18
+; GFX11-NEXT:    v_readlane_b32 s67, v74, 17
+; GFX11-NEXT:    v_readlane_b32 s66, v74, 16
+; GFX11-NEXT:    v_readlane_b32 s65, v74, 15
+; GFX11-NEXT:    v_readlane_b32 s64, v74, 14
+; GFX11-NEXT:    v_readlane_b32 s55, v74, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v74, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v74, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v74, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v74, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v74, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v74, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v74, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v74, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v74, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v74, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v74, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v74, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v74, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:72
@@ -209268,16 +209268,18 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
 ; SI-NEXT:    v_readfirstlane_b32 s92, v4
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    v_readfirstlane_b32 s78, v7
 ; SI-NEXT:    s_and_b32 s93, s92, 0xffff0000
 ; SI-NEXT:    s_and_b32 s34, vcc_lo, 0xffff0000
@@ -209336,14 +209338,12 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_readfirstlane_b32 s56, v14
 ; SI-NEXT:    s_and_b32 s57, s56, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s56, s56, 16
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    v_readfirstlane_b32 s58, v13
 ; SI-NEXT:    v_readfirstlane_b32 s62, v11
 ; SI-NEXT:    v_readfirstlane_b32 s74, v9
 ; SI-NEXT:    v_readfirstlane_b32 s88, v6
 ; SI-NEXT:    v_readfirstlane_b32 s90, v5
 ; SI-NEXT:    v_readfirstlane_b32 s94, v3
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_lshl_b32 s7, s28, 16
@@ -209367,13 +209367,13 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    s_lshl_b32 s42, s42, 16
 ; SI-NEXT:    s_lshl_b32 s44, s44, 16
 ; SI-NEXT:    s_lshl_b32 s46, s46, 16
+; SI-NEXT:    s_and_b32 s59, s58, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s58, s58, 16
+; SI-NEXT:    s_and_b32 s63, s62, 0xffff0000
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s11
-; SI-NEXT:    s_and_b32 s59, s58, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s58, s58, 16
-; SI-NEXT:    s_and_b32 s63, s62, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s62, s62, 16
 ; SI-NEXT:    s_and_b32 s75, s74, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s74, s74, 16
@@ -209428,14 +209428,14 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_mul_f32_e64 v55, 1.0, s73
 ; SI-NEXT:    v_mul_f32_e64 v56, 1.0, s63
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s61
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s31
 ; SI-NEXT:    v_mul_f32_e64 v37, 1.0, s59
 ; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v13, 1.0, s13
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s31
 ; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s9
 ; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s5
 ; SI-NEXT:    v_mul_f32_e64 v22, 1.0, s35
@@ -210442,12 +210442,12 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v58
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v12
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -210848,17 +210848,17 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v35, s30, 0
-; VI-NEXT:    v_writelane_b32 v35, s31, 1
-; VI-NEXT:    v_writelane_b32 v35, s36, 2
-; VI-NEXT:    v_writelane_b32 v35, s37, 3
-; VI-NEXT:    v_writelane_b32 v35, s38, 4
-; VI-NEXT:    v_writelane_b32 v35, s39, 5
-; VI-NEXT:    v_writelane_b32 v35, s48, 6
-; VI-NEXT:    v_writelane_b32 v35, s49, 7
-; VI-NEXT:    v_writelane_b32 v35, s50, 8
+; VI-NEXT:    v_writelane_b32 v35, s36, 0
+; VI-NEXT:    v_writelane_b32 v35, s37, 1
+; VI-NEXT:    v_writelane_b32 v35, s38, 2
+; VI-NEXT:    v_writelane_b32 v35, s39, 3
+; VI-NEXT:    v_writelane_b32 v35, s48, 4
+; VI-NEXT:    v_writelane_b32 v35, s49, 5
+; VI-NEXT:    v_writelane_b32 v35, s50, 6
+; VI-NEXT:    v_writelane_b32 v35, s51, 7
+; VI-NEXT:    v_writelane_b32 v35, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v35, s51, 9
+; VI-NEXT:    v_writelane_b32 v35, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -211511,16 +211511,16 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB101_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v35, 9
-; VI-NEXT:    v_readlane_b32 s50, v35, 8
-; VI-NEXT:    v_readlane_b32 s49, v35, 7
-; VI-NEXT:    v_readlane_b32 s48, v35, 6
-; VI-NEXT:    v_readlane_b32 s39, v35, 5
-; VI-NEXT:    v_readlane_b32 s38, v35, 4
-; VI-NEXT:    v_readlane_b32 s37, v35, 3
-; VI-NEXT:    v_readlane_b32 s36, v35, 2
-; VI-NEXT:    v_readlane_b32 s31, v35, 1
-; VI-NEXT:    v_readlane_b32 s30, v35, 0
+; VI-NEXT:    v_readlane_b32 s30, v35, 8
+; VI-NEXT:    v_readlane_b32 s31, v35, 9
+; VI-NEXT:    v_readlane_b32 s51, v35, 7
+; VI-NEXT:    v_readlane_b32 s50, v35, 6
+; VI-NEXT:    v_readlane_b32 s49, v35, 5
+; VI-NEXT:    v_readlane_b32 s48, v35, 4
+; VI-NEXT:    v_readlane_b32 s39, v35, 3
+; VI-NEXT:    v_readlane_b32 s38, v35, 2
+; VI-NEXT:    v_readlane_b32 s37, v35, 1
+; VI-NEXT:    v_readlane_b32 s36, v35, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -211533,17 +211533,17 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v36, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v36, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v36, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v36, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v36, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v36, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v36, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v36, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v36, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v36, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v36, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v36, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v36, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v36, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v36, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v36, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v36, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v36, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v36, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v36, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -212214,16 +212214,16 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB101_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v36, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v36, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v36, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v36, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v36, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v36, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v36, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v36, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v36, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v36, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v36, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v36, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v36, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v36, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v36, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v36, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v36, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v36, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v36, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v36, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -215006,55 +215006,57 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s48, 6
+; SI-NEXT:    v_writelane_b32 v63, s49, 7
+; SI-NEXT:    v_writelane_b32 v63, s50, 8
+; SI-NEXT:    v_writelane_b32 v63, s51, 9
+; SI-NEXT:    v_writelane_b32 v63, s52, 10
+; SI-NEXT:    v_writelane_b32 v63, s53, 11
+; SI-NEXT:    v_writelane_b32 v63, s54, 12
+; SI-NEXT:    v_writelane_b32 v63, s55, 13
+; SI-NEXT:    v_writelane_b32 v63, s64, 14
+; SI-NEXT:    v_writelane_b32 v63, s65, 15
+; SI-NEXT:    v_writelane_b32 v63, s66, 16
+; SI-NEXT:    v_writelane_b32 v63, s67, 17
+; SI-NEXT:    v_writelane_b32 v63, s68, 18
+; SI-NEXT:    v_writelane_b32 v63, s69, 19
+; SI-NEXT:    v_writelane_b32 v63, s70, 20
+; SI-NEXT:    v_writelane_b32 v63, s71, 21
+; SI-NEXT:    v_writelane_b32 v63, s80, 22
+; SI-NEXT:    v_writelane_b32 v63, s81, 23
+; SI-NEXT:    v_writelane_b32 v63, s82, 24
+; SI-NEXT:    v_writelane_b32 v63, s83, 25
+; SI-NEXT:    v_writelane_b32 v63, s84, 26
+; SI-NEXT:    v_writelane_b32 v63, s85, 27
+; SI-NEXT:    v_writelane_b32 v63, s86, 28
 ; SI-NEXT:    s_lshr_b32 s5, s16, 16
 ; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
+; SI-NEXT:    v_writelane_b32 v63, s87, 29
 ; SI-NEXT:    s_lshr_b32 s6, s17, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v62, s5, 0
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
+; SI-NEXT:    v_writelane_b32 v63, s96, 30
 ; SI-NEXT:    s_lshr_b32 s7, s18, 16
 ; SI-NEXT:    v_writelane_b32 v62, s6, 1
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
+; SI-NEXT:    v_writelane_b32 v63, s97, 31
 ; SI-NEXT:    s_lshr_b32 s8, s19, 16
 ; SI-NEXT:    v_writelane_b32 v62, s7, 2
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
+; SI-NEXT:    v_writelane_b32 v63, s98, 32
 ; SI-NEXT:    s_lshr_b32 s9, s20, 16
 ; SI-NEXT:    v_writelane_b32 v62, s8, 3
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
+; SI-NEXT:    v_writelane_b32 v63, s99, 33
 ; SI-NEXT:    s_lshr_b32 s10, s21, 16
 ; SI-NEXT:    v_writelane_b32 v62, s9, 4
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
+; SI-NEXT:    v_writelane_b32 v63, s30, 34
 ; SI-NEXT:    s_lshr_b32 s11, s22, 16
 ; SI-NEXT:    v_writelane_b32 v62, s10, 5
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
+; SI-NEXT:    v_writelane_b32 v63, s31, 35
 ; SI-NEXT:    s_lshr_b32 s12, s23, 16
 ; SI-NEXT:    v_readfirstlane_b32 s52, v17
 ; SI-NEXT:    v_readfirstlane_b32 s48, v16
@@ -215075,7 +215077,6 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_readfirstlane_b32 s46, v1
 ; SI-NEXT:    v_readfirstlane_b32 s44, v0
 ; SI-NEXT:    v_writelane_b32 v62, s11, 6
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
 ; SI-NEXT:    s_lshr_b32 s43, s29, 16
 ; SI-NEXT:    s_lshr_b32 s42, s28, 16
 ; SI-NEXT:    s_lshr_b32 s41, s27, 16
@@ -215102,7 +215103,6 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    s_lshr_b32 s45, s44, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
 ; SI-NEXT:    v_writelane_b32 v62, s12, 7
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_writelane_b32 v62, s13, 8
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -215840,42 +215840,42 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v40
 ; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v54
 ; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v52
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 34
+; SI-NEXT:    v_readlane_b32 s31, v63, 35
+; SI-NEXT:    v_readlane_b32 s99, v63, 33
+; SI-NEXT:    v_readlane_b32 s98, v63, 32
+; SI-NEXT:    v_readlane_b32 s97, v63, 31
+; SI-NEXT:    v_readlane_b32 s96, v63, 30
+; SI-NEXT:    v_readlane_b32 s87, v63, 29
+; SI-NEXT:    v_readlane_b32 s86, v63, 28
+; SI-NEXT:    v_readlane_b32 s85, v63, 27
+; SI-NEXT:    v_readlane_b32 s84, v63, 26
+; SI-NEXT:    v_readlane_b32 s83, v63, 25
+; SI-NEXT:    v_readlane_b32 s82, v63, 24
+; SI-NEXT:    v_readlane_b32 s81, v63, 23
+; SI-NEXT:    v_readlane_b32 s80, v63, 22
+; SI-NEXT:    v_readlane_b32 s71, v63, 21
+; SI-NEXT:    v_readlane_b32 s70, v63, 20
+; SI-NEXT:    v_readlane_b32 s69, v63, 19
+; SI-NEXT:    v_readlane_b32 s68, v63, 18
+; SI-NEXT:    v_readlane_b32 s67, v63, 17
+; SI-NEXT:    v_readlane_b32 s66, v63, 16
+; SI-NEXT:    v_readlane_b32 s65, v63, 15
+; SI-NEXT:    v_readlane_b32 s64, v63, 14
+; SI-NEXT:    v_readlane_b32 s55, v63, 13
+; SI-NEXT:    v_readlane_b32 s54, v63, 12
+; SI-NEXT:    v_readlane_b32 s53, v63, 11
+; SI-NEXT:    v_readlane_b32 s52, v63, 10
+; SI-NEXT:    v_readlane_b32 s51, v63, 9
+; SI-NEXT:    v_readlane_b32 s50, v63, 8
+; SI-NEXT:    v_readlane_b32 s49, v63, 7
+; SI-NEXT:    v_readlane_b32 s48, v63, 6
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -215965,17 +215965,17 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v33, s30, 0
-; VI-NEXT:    v_writelane_b32 v33, s31, 1
-; VI-NEXT:    v_writelane_b32 v33, s36, 2
-; VI-NEXT:    v_writelane_b32 v33, s37, 3
-; VI-NEXT:    v_writelane_b32 v33, s38, 4
-; VI-NEXT:    v_writelane_b32 v33, s39, 5
-; VI-NEXT:    v_writelane_b32 v33, s48, 6
-; VI-NEXT:    v_writelane_b32 v33, s49, 7
-; VI-NEXT:    v_writelane_b32 v33, s50, 8
+; VI-NEXT:    v_writelane_b32 v33, s36, 0
+; VI-NEXT:    v_writelane_b32 v33, s37, 1
+; VI-NEXT:    v_writelane_b32 v33, s38, 2
+; VI-NEXT:    v_writelane_b32 v33, s39, 3
+; VI-NEXT:    v_writelane_b32 v33, s48, 4
+; VI-NEXT:    v_writelane_b32 v33, s49, 5
+; VI-NEXT:    v_writelane_b32 v33, s50, 6
+; VI-NEXT:    v_writelane_b32 v33, s51, 7
+; VI-NEXT:    v_writelane_b32 v33, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v33, s51, 9
+; VI-NEXT:    v_writelane_b32 v33, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -216197,16 +216197,16 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB103_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v33, 9
-; VI-NEXT:    v_readlane_b32 s50, v33, 8
-; VI-NEXT:    v_readlane_b32 s49, v33, 7
-; VI-NEXT:    v_readlane_b32 s48, v33, 6
-; VI-NEXT:    v_readlane_b32 s39, v33, 5
-; VI-NEXT:    v_readlane_b32 s38, v33, 4
-; VI-NEXT:    v_readlane_b32 s37, v33, 3
-; VI-NEXT:    v_readlane_b32 s36, v33, 2
-; VI-NEXT:    v_readlane_b32 s31, v33, 1
-; VI-NEXT:    v_readlane_b32 s30, v33, 0
+; VI-NEXT:    v_readlane_b32 s30, v33, 8
+; VI-NEXT:    v_readlane_b32 s31, v33, 9
+; VI-NEXT:    v_readlane_b32 s51, v33, 7
+; VI-NEXT:    v_readlane_b32 s50, v33, 6
+; VI-NEXT:    v_readlane_b32 s49, v33, 5
+; VI-NEXT:    v_readlane_b32 s48, v33, 4
+; VI-NEXT:    v_readlane_b32 s39, v33, 3
+; VI-NEXT:    v_readlane_b32 s38, v33, 2
+; VI-NEXT:    v_readlane_b32 s37, v33, 1
+; VI-NEXT:    v_readlane_b32 s36, v33, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -216219,17 +216219,17 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -216323,16 +216323,16 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB103_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -219821,11 +219821,13 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s30, 4
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
+; SI-NEXT:    v_writelane_b32 v63, s31, 5
 ; SI-NEXT:    s_and_b32 s14, s24, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s15, s24, 16
 ; SI-NEXT:    s_and_b32 s24, s23, 0xffff0000
@@ -219833,7 +219835,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_and_b32 s30, vcc_lo, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s31, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v1
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
 ; SI-NEXT:    s_lshl_b32 s13, s25, 16
 ; SI-NEXT:    v_readfirstlane_b32 s42, v17
 ; SI-NEXT:    v_readfirstlane_b32 s44, v16
@@ -219854,7 +219855,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_lshl_b32 s35, vcc_lo, 16
 ; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v0
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s23
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
@@ -220826,12 +220826,12 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v49
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 4
+; SI-NEXT:    v_readlane_b32 s31, v63, 5
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v24
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
@@ -221026,17 +221026,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v35, s30, 0
-; VI-NEXT:    v_writelane_b32 v35, s31, 1
-; VI-NEXT:    v_writelane_b32 v35, s36, 2
-; VI-NEXT:    v_writelane_b32 v35, s37, 3
-; VI-NEXT:    v_writelane_b32 v35, s38, 4
-; VI-NEXT:    v_writelane_b32 v35, s39, 5
-; VI-NEXT:    v_writelane_b32 v35, s48, 6
-; VI-NEXT:    v_writelane_b32 v35, s49, 7
-; VI-NEXT:    v_writelane_b32 v35, s50, 8
+; VI-NEXT:    v_writelane_b32 v35, s36, 0
+; VI-NEXT:    v_writelane_b32 v35, s37, 1
+; VI-NEXT:    v_writelane_b32 v35, s38, 2
+; VI-NEXT:    v_writelane_b32 v35, s39, 3
+; VI-NEXT:    v_writelane_b32 v35, s48, 4
+; VI-NEXT:    v_writelane_b32 v35, s49, 5
+; VI-NEXT:    v_writelane_b32 v35, s50, 6
+; VI-NEXT:    v_writelane_b32 v35, s51, 7
+; VI-NEXT:    v_writelane_b32 v35, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v35, s51, 9
+; VI-NEXT:    v_writelane_b32 v35, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -221689,16 +221689,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s50
 ; VI-NEXT:    v_mov_b32_e32 v31, s51
 ; VI-NEXT:  .LBB105_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v35, 9
-; VI-NEXT:    v_readlane_b32 s50, v35, 8
-; VI-NEXT:    v_readlane_b32 s49, v35, 7
-; VI-NEXT:    v_readlane_b32 s48, v35, 6
-; VI-NEXT:    v_readlane_b32 s39, v35, 5
-; VI-NEXT:    v_readlane_b32 s38, v35, 4
-; VI-NEXT:    v_readlane_b32 s37, v35, 3
-; VI-NEXT:    v_readlane_b32 s36, v35, 2
-; VI-NEXT:    v_readlane_b32 s31, v35, 1
-; VI-NEXT:    v_readlane_b32 s30, v35, 0
+; VI-NEXT:    v_readlane_b32 s30, v35, 8
+; VI-NEXT:    v_readlane_b32 s31, v35, 9
+; VI-NEXT:    v_readlane_b32 s51, v35, 7
+; VI-NEXT:    v_readlane_b32 s50, v35, 6
+; VI-NEXT:    v_readlane_b32 s49, v35, 5
+; VI-NEXT:    v_readlane_b32 s48, v35, 4
+; VI-NEXT:    v_readlane_b32 s39, v35, 3
+; VI-NEXT:    v_readlane_b32 s38, v35, 2
+; VI-NEXT:    v_readlane_b32 s37, v35, 1
+; VI-NEXT:    v_readlane_b32 s36, v35, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -221711,17 +221711,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v36, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v36, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v36, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v36, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v36, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v36, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v36, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v36, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v36, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v36, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v36, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v36, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v36, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v36, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v36, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v36, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v36, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v36, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v36, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v36, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -222360,16 +222360,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v30, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v31, s51
 ; GFX9-NEXT:  .LBB105_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v36, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v36, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v36, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v36, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v36, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v36, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v36, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v36, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v36, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v36, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v36, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v36, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v36, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v36, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v36, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v36, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v36, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v36, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v36, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v36, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -224658,66 +224658,69 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v33, s30, 0
-; SI-NEXT:    v_writelane_b32 v33, s31, 1
-; SI-NEXT:    v_writelane_b32 v33, s34, 2
-; SI-NEXT:    v_writelane_b32 v33, s35, 3
-; SI-NEXT:    v_writelane_b32 v33, s36, 4
-; SI-NEXT:    v_writelane_b32 v33, s37, 5
-; SI-NEXT:    v_writelane_b32 v33, s38, 6
-; SI-NEXT:    v_writelane_b32 v33, s39, 7
-; SI-NEXT:    v_writelane_b32 v33, s48, 8
-; SI-NEXT:    v_writelane_b32 v33, s49, 9
-; SI-NEXT:    v_writelane_b32 v33, s50, 10
-; SI-NEXT:    v_writelane_b32 v33, s51, 11
-; SI-NEXT:    v_writelane_b32 v33, s52, 12
-; SI-NEXT:    v_writelane_b32 v33, s53, 13
-; SI-NEXT:    v_writelane_b32 v33, s54, 14
-; SI-NEXT:    v_writelane_b32 v33, s55, 15
-; SI-NEXT:    v_writelane_b32 v33, s64, 16
-; SI-NEXT:    v_writelane_b32 v33, s65, 17
-; SI-NEXT:    v_writelane_b32 v33, s66, 18
-; SI-NEXT:    v_writelane_b32 v33, s67, 19
+; SI-NEXT:    v_writelane_b32 v33, s34, 0
+; SI-NEXT:    v_writelane_b32 v33, s35, 1
+; SI-NEXT:    v_writelane_b32 v33, s36, 2
+; SI-NEXT:    v_writelane_b32 v33, s37, 3
+; SI-NEXT:    v_writelane_b32 v33, s38, 4
+; SI-NEXT:    v_writelane_b32 v33, s39, 5
+; SI-NEXT:    v_writelane_b32 v33, s48, 6
+; SI-NEXT:    v_writelane_b32 v33, s49, 7
+; SI-NEXT:    v_writelane_b32 v33, s50, 8
+; SI-NEXT:    v_writelane_b32 v33, s51, 9
+; SI-NEXT:    v_writelane_b32 v33, s52, 10
+; SI-NEXT:    v_writelane_b32 v33, s53, 11
+; SI-NEXT:    v_writelane_b32 v33, s54, 12
+; SI-NEXT:    v_writelane_b32 v33, s55, 13
+; SI-NEXT:    v_writelane_b32 v33, s64, 14
+; SI-NEXT:    v_writelane_b32 v33, s65, 15
+; SI-NEXT:    v_writelane_b32 v33, s66, 16
+; SI-NEXT:    v_writelane_b32 v33, s67, 17
+; SI-NEXT:    v_writelane_b32 v33, s68, 18
+; SI-NEXT:    v_writelane_b32 v33, s69, 19
+; SI-NEXT:    v_writelane_b32 v33, s70, 20
+; SI-NEXT:    v_writelane_b32 v33, s71, 21
+; SI-NEXT:    v_writelane_b32 v33, s80, 22
+; SI-NEXT:    v_writelane_b32 v33, s81, 23
+; SI-NEXT:    v_writelane_b32 v33, s82, 24
 ; SI-NEXT:    s_lshr_b32 s5, s16, 16
 ; SI-NEXT:    ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v33, s68, 20
+; SI-NEXT:    v_writelane_b32 v33, s83, 25
 ; SI-NEXT:    s_lshr_b32 s6, s17, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v34, s5, 0
-; SI-NEXT:    v_writelane_b32 v33, s69, 21
+; SI-NEXT:    v_writelane_b32 v33, s84, 26
 ; SI-NEXT:    s_lshr_b32 s8, s18, 16
 ; SI-NEXT:    v_writelane_b32 v34, s6, 1
-; SI-NEXT:    v_writelane_b32 v33, s70, 22
+; SI-NEXT:    v_writelane_b32 v33, s85, 27
 ; SI-NEXT:    s_lshr_b32 s90, s19, 16
 ; SI-NEXT:    v_writelane_b32 v34, s8, 2
-; SI-NEXT:    v_writelane_b32 v33, s71, 23
+; SI-NEXT:    v_writelane_b32 v33, s86, 28
 ; SI-NEXT:    s_lshr_b32 s91, s20, 16
 ; SI-NEXT:    v_writelane_b32 v34, s90, 3
-; SI-NEXT:    v_writelane_b32 v33, s80, 24
+; SI-NEXT:    v_writelane_b32 v33, s87, 29
 ; SI-NEXT:    s_lshr_b32 s92, s21, 16
 ; SI-NEXT:    v_writelane_b32 v34, s91, 4
-; SI-NEXT:    v_writelane_b32 v33, s81, 25
+; SI-NEXT:    v_writelane_b32 v33, s96, 30
 ; SI-NEXT:    s_lshr_b32 s93, s22, 16
 ; SI-NEXT:    v_writelane_b32 v34, s92, 5
-; SI-NEXT:    v_writelane_b32 v33, s82, 26
+; SI-NEXT:    v_writelane_b32 v33, s97, 31
 ; SI-NEXT:    s_lshr_b32 s94, s23, 16
 ; SI-NEXT:    v_writelane_b32 v34, s93, 6
-; SI-NEXT:    v_writelane_b32 v33, s83, 27
+; SI-NEXT:    v_writelane_b32 v33, s98, 32
 ; SI-NEXT:    s_lshr_b32 s95, s24, 16
 ; SI-NEXT:    v_writelane_b32 v34, s94, 7
-; SI-NEXT:    v_writelane_b32 v33, s84, 28
+; SI-NEXT:    v_writelane_b32 v33, s99, 33
 ; SI-NEXT:    s_lshr_b32 vcc_lo, s25, 16
 ; SI-NEXT:    v_writelane_b32 v34, s95, 8
-; SI-NEXT:    v_writelane_b32 v33, s85, 29
+; SI-NEXT:    v_writelane_b32 v33, s30, 34
 ; SI-NEXT:    s_lshr_b32 vcc_hi, s26, 16
 ; SI-NEXT:    v_writelane_b32 v34, vcc_lo, 9
-; SI-NEXT:    v_writelane_b32 v33, s86, 30
+; SI-NEXT:    v_writelane_b32 v33, s31, 35
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
 ; SI-NEXT:    v_writelane_b32 v34, vcc_hi, 10
-; SI-NEXT:    v_writelane_b32 v33, s87, 31
 ; SI-NEXT:    s_lshr_b32 s31, s28, 16
 ; SI-NEXT:    v_writelane_b32 v34, s30, 11
-; SI-NEXT:    v_writelane_b32 v33, s96, 32
 ; SI-NEXT:    s_lshr_b32 s34, s29, 16
 ; SI-NEXT:    v_readfirstlane_b32 s89, v17
 ; SI-NEXT:    v_readfirstlane_b32 s79, v16
@@ -224738,7 +224741,6 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s9, v1
 ; SI-NEXT:    v_readfirstlane_b32 s7, v0
 ; SI-NEXT:    v_writelane_b32 v34, s31, 12
-; SI-NEXT:    v_writelane_b32 v33, s97, 33
 ; SI-NEXT:    s_lshr_b32 s88, s89, 16
 ; SI-NEXT:    s_lshr_b32 s78, s79, 16
 ; SI-NEXT:    s_lshr_b32 s76, s77, 16
@@ -224759,10 +224761,8 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_lshr_b32 s35, s7, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v18
 ; SI-NEXT:    v_writelane_b32 v34, s34, 13
-; SI-NEXT:    v_writelane_b32 v33, s98, 34
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_writelane_b32 v34, s35, 14
-; SI-NEXT:    v_writelane_b32 v33, s99, 35
 ; SI-NEXT:    v_writelane_b32 v34, s36, 15
 ; SI-NEXT:    s_cbranch_scc0 .LBB107_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -225379,43 +225379,43 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s91
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT:    v_readlane_b32 s30, v33, 34
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT:    v_readlane_b32 s99, v33, 35
-; SI-NEXT:    v_readlane_b32 s98, v33, 34
-; SI-NEXT:    v_readlane_b32 s97, v33, 33
-; SI-NEXT:    v_readlane_b32 s96, v33, 32
-; SI-NEXT:    v_readlane_b32 s87, v33, 31
-; SI-NEXT:    v_readlane_b32 s86, v33, 30
-; SI-NEXT:    v_readlane_b32 s85, v33, 29
-; SI-NEXT:    v_readlane_b32 s84, v33, 28
-; SI-NEXT:    v_readlane_b32 s83, v33, 27
-; SI-NEXT:    v_readlane_b32 s82, v33, 26
-; SI-NEXT:    v_readlane_b32 s81, v33, 25
-; SI-NEXT:    v_readlane_b32 s80, v33, 24
-; SI-NEXT:    v_readlane_b32 s71, v33, 23
-; SI-NEXT:    v_readlane_b32 s70, v33, 22
-; SI-NEXT:    v_readlane_b32 s69, v33, 21
-; SI-NEXT:    v_readlane_b32 s68, v33, 20
-; SI-NEXT:    v_readlane_b32 s67, v33, 19
-; SI-NEXT:    v_readlane_b32 s66, v33, 18
-; SI-NEXT:    v_readlane_b32 s65, v33, 17
-; SI-NEXT:    v_readlane_b32 s64, v33, 16
-; SI-NEXT:    v_readlane_b32 s55, v33, 15
-; SI-NEXT:    v_readlane_b32 s54, v33, 14
-; SI-NEXT:    v_readlane_b32 s53, v33, 13
-; SI-NEXT:    v_readlane_b32 s52, v33, 12
-; SI-NEXT:    v_readlane_b32 s51, v33, 11
-; SI-NEXT:    v_readlane_b32 s50, v33, 10
-; SI-NEXT:    v_readlane_b32 s49, v33, 9
-; SI-NEXT:    v_readlane_b32 s48, v33, 8
-; SI-NEXT:    v_readlane_b32 s39, v33, 7
-; SI-NEXT:    v_readlane_b32 s38, v33, 6
-; SI-NEXT:    v_readlane_b32 s37, v33, 5
-; SI-NEXT:    v_readlane_b32 s36, v33, 4
-; SI-NEXT:    v_readlane_b32 s35, v33, 3
-; SI-NEXT:    v_readlane_b32 s34, v33, 2
-; SI-NEXT:    v_readlane_b32 s31, v33, 1
-; SI-NEXT:    v_readlane_b32 s30, v33, 0
+; SI-NEXT:    v_readlane_b32 s31, v33, 35
+; SI-NEXT:    v_readlane_b32 s99, v33, 33
+; SI-NEXT:    v_readlane_b32 s98, v33, 32
+; SI-NEXT:    v_readlane_b32 s97, v33, 31
+; SI-NEXT:    v_readlane_b32 s96, v33, 30
+; SI-NEXT:    v_readlane_b32 s87, v33, 29
+; SI-NEXT:    v_readlane_b32 s86, v33, 28
+; SI-NEXT:    v_readlane_b32 s85, v33, 27
+; SI-NEXT:    v_readlane_b32 s84, v33, 26
+; SI-NEXT:    v_readlane_b32 s83, v33, 25
+; SI-NEXT:    v_readlane_b32 s82, v33, 24
+; SI-NEXT:    v_readlane_b32 s81, v33, 23
+; SI-NEXT:    v_readlane_b32 s80, v33, 22
+; SI-NEXT:    v_readlane_b32 s71, v33, 21
+; SI-NEXT:    v_readlane_b32 s70, v33, 20
+; SI-NEXT:    v_readlane_b32 s69, v33, 19
+; SI-NEXT:    v_readlane_b32 s68, v33, 18
+; SI-NEXT:    v_readlane_b32 s67, v33, 17
+; SI-NEXT:    v_readlane_b32 s66, v33, 16
+; SI-NEXT:    v_readlane_b32 s65, v33, 15
+; SI-NEXT:    v_readlane_b32 s64, v33, 14
+; SI-NEXT:    v_readlane_b32 s55, v33, 13
+; SI-NEXT:    v_readlane_b32 s54, v33, 12
+; SI-NEXT:    v_readlane_b32 s53, v33, 11
+; SI-NEXT:    v_readlane_b32 s52, v33, 10
+; SI-NEXT:    v_readlane_b32 s51, v33, 9
+; SI-NEXT:    v_readlane_b32 s50, v33, 8
+; SI-NEXT:    v_readlane_b32 s49, v33, 7
+; SI-NEXT:    v_readlane_b32 s48, v33, 6
+; SI-NEXT:    v_readlane_b32 s39, v33, 5
+; SI-NEXT:    v_readlane_b32 s38, v33, 4
+; SI-NEXT:    v_readlane_b32 s37, v33, 3
+; SI-NEXT:    v_readlane_b32 s36, v33, 2
+; SI-NEXT:    v_readlane_b32 s35, v33, 1
+; SI-NEXT:    v_readlane_b32 s34, v33, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -225429,14 +225429,14 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
+; VI-NEXT:    v_writelane_b32 v32, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s8, v17
 ; VI-NEXT:    v_readfirstlane_b32 s9, v16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v15
@@ -225456,7 +225456,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s6, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
+; VI-NEXT:    v_writelane_b32 v32, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB107_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_cbranch_execnz .LBB107_3
@@ -225622,6 +225622,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; VI-NEXT:    s_add_i32 s46, s46, 0x30000
 ; VI-NEXT:    s_add_i32 s47, s4, 0x30000
 ; VI-NEXT:  .LBB107_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v32, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s16
 ; VI-NEXT:    v_mov_b32_e32 v1, s17
 ; VI-NEXT:    v_mov_b32_e32 v2, s18
@@ -225654,14 +225655,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v29, s10
 ; VI-NEXT:    v_mov_b32_e32 v30, s9
 ; VI-NEXT:    v_mov_b32_e32 v31, s8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 7
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -225676,17 +225676,17 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -225779,16 +225779,16 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB107_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -226642,13 +226642,14 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s36, 2
+; SI-NEXT:    v_writelane_b32 v63, s37, 3
+; SI-NEXT:    v_writelane_b32 v63, s38, 4
+; SI-NEXT:    v_writelane_b32 v63, s39, 5
+; SI-NEXT:    v_writelane_b32 v63, s30, 6
+; SI-NEXT:    v_writelane_b32 v63, s31, 7
 ; SI-NEXT:    v_readfirstlane_b32 s10, v17
 ; SI-NEXT:    v_readfirstlane_b32 s6, v16
 ; SI-NEXT:    v_readfirstlane_b32 s11, v15
@@ -226667,7 +226668,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
 ; SI-NEXT:    s_lshr_b32 s43, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s42, s27, 16
@@ -227187,14 +227187,14 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 6
+; SI-NEXT:    v_readlane_b32 s31, v63, 7
+; SI-NEXT:    v_readlane_b32 s39, v63, 5
+; SI-NEXT:    v_readlane_b32 s38, v63, 4
+; SI-NEXT:    v_readlane_b32 s37, v63, 3
+; SI-NEXT:    v_readlane_b32 s36, v63, 2
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
@@ -227298,17 +227298,17 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v33, s30, 0
-; VI-NEXT:    v_writelane_b32 v33, s31, 1
-; VI-NEXT:    v_writelane_b32 v33, s36, 2
-; VI-NEXT:    v_writelane_b32 v33, s37, 3
-; VI-NEXT:    v_writelane_b32 v33, s38, 4
-; VI-NEXT:    v_writelane_b32 v33, s39, 5
-; VI-NEXT:    v_writelane_b32 v33, s48, 6
-; VI-NEXT:    v_writelane_b32 v33, s49, 7
-; VI-NEXT:    v_writelane_b32 v33, s50, 8
+; VI-NEXT:    v_writelane_b32 v33, s36, 0
+; VI-NEXT:    v_writelane_b32 v33, s37, 1
+; VI-NEXT:    v_writelane_b32 v33, s38, 2
+; VI-NEXT:    v_writelane_b32 v33, s39, 3
+; VI-NEXT:    v_writelane_b32 v33, s48, 4
+; VI-NEXT:    v_writelane_b32 v33, s49, 5
+; VI-NEXT:    v_writelane_b32 v33, s50, 6
+; VI-NEXT:    v_writelane_b32 v33, s51, 7
+; VI-NEXT:    v_writelane_b32 v33, s30, 8
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v33, s51, 9
+; VI-NEXT:    v_writelane_b32 v33, s31, 9
 ; VI-NEXT:    v_readfirstlane_b32 s51, v17
 ; VI-NEXT:    v_readfirstlane_b32 s50, v16
 ; VI-NEXT:    v_readfirstlane_b32 s49, v15
@@ -227530,16 +227530,16 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB109_5: ; %end
-; VI-NEXT:    v_readlane_b32 s51, v33, 9
-; VI-NEXT:    v_readlane_b32 s50, v33, 8
-; VI-NEXT:    v_readlane_b32 s49, v33, 7
-; VI-NEXT:    v_readlane_b32 s48, v33, 6
-; VI-NEXT:    v_readlane_b32 s39, v33, 5
-; VI-NEXT:    v_readlane_b32 s38, v33, 4
-; VI-NEXT:    v_readlane_b32 s37, v33, 3
-; VI-NEXT:    v_readlane_b32 s36, v33, 2
-; VI-NEXT:    v_readlane_b32 s31, v33, 1
-; VI-NEXT:    v_readlane_b32 s30, v33, 0
+; VI-NEXT:    v_readlane_b32 s30, v33, 8
+; VI-NEXT:    v_readlane_b32 s31, v33, 9
+; VI-NEXT:    v_readlane_b32 s51, v33, 7
+; VI-NEXT:    v_readlane_b32 s50, v33, 6
+; VI-NEXT:    v_readlane_b32 s49, v33, 5
+; VI-NEXT:    v_readlane_b32 s48, v33, 4
+; VI-NEXT:    v_readlane_b32 s39, v33, 3
+; VI-NEXT:    v_readlane_b32 s38, v33, 2
+; VI-NEXT:    v_readlane_b32 s37, v33, 1
+; VI-NEXT:    v_readlane_b32 s36, v33, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -227552,17 +227552,17 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -227656,16 +227656,16 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB109_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -228992,90 +228992,91 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
 ; SI-NEXT:    s_lshr_b32 s4, s27, 16
 ; SI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v33, s4, 0
 ; SI-NEXT:    s_lshr_b32 s4, s25, 16
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
 ; SI-NEXT:    v_writelane_b32 v33, s4, 1
 ; SI-NEXT:    s_lshr_b32 s4, s24, 16
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
 ; SI-NEXT:    v_writelane_b32 v33, s4, 2
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
 ; SI-NEXT:    v_writelane_b32 v33, s23, 3
 ; SI-NEXT:    s_lshr_b32 s4, s23, 16
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
 ; SI-NEXT:    v_writelane_b32 v33, s4, 4
 ; SI-NEXT:    s_lshr_b32 s4, s21, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
 ; SI-NEXT:    v_writelane_b32 v33, s4, 5
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
 ; SI-NEXT:    v_writelane_b32 v33, s19, 6
 ; SI-NEXT:    s_lshr_b32 s4, s19, 16
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
 ; SI-NEXT:    v_writelane_b32 v33, s4, 7
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
 ; SI-NEXT:    v_writelane_b32 v33, s17, 8
 ; SI-NEXT:    s_lshr_b32 s4, s17, 16
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
 ; SI-NEXT:    v_writelane_b32 v33, s4, 9
 ; SI-NEXT:    s_lshr_b32 s4, s16, 16
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
 ; SI-NEXT:    s_mov_b32 s46, s16
 ; SI-NEXT:    v_writelane_b32 v33, s4, 10
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
 ; SI-NEXT:    s_mov_b32 s47, s18
 ; SI-NEXT:    v_writelane_b32 v33, s46, 11
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
 ; SI-NEXT:    s_mov_b32 s57, s20
 ; SI-NEXT:    v_writelane_b32 v33, s47, 12
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    s_mov_b32 s31, s22
 ; SI-NEXT:    v_writelane_b32 v33, s57, 13
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
 ; SI-NEXT:    s_mov_b32 s35, s24
 ; SI-NEXT:    v_writelane_b32 v33, s31, 14
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
 ; SI-NEXT:    s_mov_b32 s69, s26
 ; SI-NEXT:    v_writelane_b32 v33, s35, 15
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
 ; SI-NEXT:    s_mov_b32 s78, s28
 ; SI-NEXT:    v_writelane_b32 v33, s69, 16
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
 ; SI-NEXT:    v_readfirstlane_b32 s77, v10
 ; SI-NEXT:    v_readfirstlane_b32 s45, v0
 ; SI-NEXT:    v_writelane_b32 v33, s78, 17
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
 ; SI-NEXT:    s_lshr_b32 s34, s18, 16
 ; SI-NEXT:    s_lshr_b32 s18, s77, 16
 ; SI-NEXT:    v_writelane_b32 v33, s45, 18
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
 ; SI-NEXT:    v_readfirstlane_b32 s19, v13
 ; SI-NEXT:    v_readfirstlane_b32 s23, v12
 ; SI-NEXT:    v_writelane_b32 v33, s18, 19
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
 ; SI-NEXT:    s_lshr_b32 s36, s20, 16
 ; SI-NEXT:    s_lshr_b32 s20, s23, 16
 ; SI-NEXT:    v_writelane_b32 v33, s19, 20
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
 ; SI-NEXT:    v_readfirstlane_b32 s58, v15
 ; SI-NEXT:    v_readfirstlane_b32 s88, v14
 ; SI-NEXT:    v_writelane_b32 v33, s20, 21
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
 ; SI-NEXT:    s_mov_b32 s39, s27
 ; SI-NEXT:    v_readfirstlane_b32 s60, v17
 ; SI-NEXT:    v_readfirstlane_b32 s92, v16
@@ -229091,7 +229092,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s71, v2
 ; SI-NEXT:    v_readfirstlane_b32 s68, v1
 ; SI-NEXT:    v_writelane_b32 v33, s58, 22
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s93, s29, 16
 ; SI-NEXT:    s_lshr_b32 s85, s28, 16
 ; SI-NEXT:    s_lshr_b32 s84, s26, 16
@@ -229669,6 +229669,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s46, s99, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s46
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
 ; SI-NEXT:    v_readlane_b32 s47, v33, 33
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
@@ -229702,42 +229703,41 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v29, s7
 ; SI-NEXT:    v_mov_b32_e32 v30, s4
 ; SI-NEXT:    v_mov_b32_e32 v31, s5
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -229793,14 +229793,14 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v18
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
+; VI-NEXT:    v_writelane_b32 v32, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s8, v17
 ; VI-NEXT:    v_readfirstlane_b32 s9, v16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v15
@@ -229820,7 +229820,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s6, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
+; VI-NEXT:    v_writelane_b32 v32, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB111_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_cbranch_execnz .LBB111_3
@@ -229986,6 +229986,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; VI-NEXT:    s_add_i32 s46, s46, 0x30000
 ; VI-NEXT:    s_add_i32 s47, s4, 0x30000
 ; VI-NEXT:  .LBB111_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v32, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s16
 ; VI-NEXT:    v_mov_b32_e32 v1, s17
 ; VI-NEXT:    v_mov_b32_e32 v2, s18
@@ -230018,14 +230019,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v29, s10
 ; VI-NEXT:    v_mov_b32_e32 v30, s9
 ; VI-NEXT:    v_mov_b32_e32 v31, s8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s31, v32, 7
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -230040,17 +230040,17 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v32, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v32, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v32, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v32, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v32, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v32, s48, 6
-; GFX9-NEXT:    v_writelane_b32 v32, s49, 7
-; GFX9-NEXT:    v_writelane_b32 v32, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v32, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v32, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v32, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v32, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v32, s48, 4
+; GFX9-NEXT:    v_writelane_b32 v32, s49, 5
+; GFX9-NEXT:    v_writelane_b32 v32, s50, 6
+; GFX9-NEXT:    v_writelane_b32 v32, s51, 7
+; GFX9-NEXT:    v_writelane_b32 v32, s30, 8
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX9-NEXT:    v_writelane_b32 v32, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v32, s31, 9
 ; GFX9-NEXT:    v_readfirstlane_b32 s51, v17
 ; GFX9-NEXT:    v_readfirstlane_b32 s50, v16
 ; GFX9-NEXT:    v_readfirstlane_b32 s49, v15
@@ -230143,16 +230143,16 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB111_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s51, v32, 9
-; GFX9-NEXT:    v_readlane_b32 s50, v32, 8
-; GFX9-NEXT:    v_readlane_b32 s49, v32, 7
-; GFX9-NEXT:    v_readlane_b32 s48, v32, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v32, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v32, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v32, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v32, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v32, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v32, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v32, 9
+; GFX9-NEXT:    v_readlane_b32 s51, v32, 7
+; GFX9-NEXT:    v_readlane_b32 s50, v32, 6
+; GFX9-NEXT:    v_readlane_b32 s49, v32, 5
+; GFX9-NEXT:    v_readlane_b32 s48, v32, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v32, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v32, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v32, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v32, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 8b971fe4c291d..7e231f48de12a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) {
+define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -88,7 +88,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v4i32_to_v4f32_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v4i32_to_v4f32_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -190,7 +190,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) {
+define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -270,7 +270,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v4f32_to_v4i32_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v4f32_to_v4i32_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,7 +375,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) {
+define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -457,7 +457,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v4i32_to_v2i64_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v4i32_to_v2i64_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -559,7 +559,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) {
+define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -642,7 +642,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v2i64_to_v4i32_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v2i64_to_v4i32_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -744,7 +744,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) {
+define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,7 +826,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v4i32_to_v2f64_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v4i32_to_v2f64_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -928,7 +928,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) {
+define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1005,7 +1005,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v2f64_to_v4i32_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v2f64_to_v4i32_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1102,7 +1102,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) {
+define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1210,7 +1210,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1336,7 +1336,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) {
+define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1477,7 +1477,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1629,7 +1629,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) {
+define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1737,7 +1737,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1863,7 +1863,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) {
+define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2021,7 +2021,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2189,7 +2189,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2318,7 +2318,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2456,7 +2456,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
+define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2882,7 +2882,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3351,7 +3351,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) {
+define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3677,7 +3677,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i32_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3965,7 +3965,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
+define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4490,7 +4490,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v4i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4853,7 +4853,7 @@ end:
   ret <4 x i32> %phi
 }
 
-define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) {
+define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4933,7 +4933,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v4f32_to_v2i64_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v4f32_to_v2i64_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5038,7 +5038,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) {
+define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5121,7 +5121,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v2i64_to_v4f32_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v2i64_to_v4f32_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5223,7 +5223,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) {
+define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5303,7 +5303,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v4f32_to_v2f64_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v4f32_to_v2f64_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5408,7 +5408,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) {
+define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5485,7 +5485,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v2f64_to_v4f32_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v2f64_to_v4f32_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5582,7 +5582,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) {
+define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5688,7 +5688,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5822,7 +5822,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) {
+define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5963,7 +5963,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6115,7 +6115,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) {
+define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6221,7 +6221,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6355,7 +6355,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) {
+define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6513,7 +6513,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6681,7 +6681,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6808,7 +6808,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6958,7 +6958,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
+define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7384,7 +7384,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7853,7 +7853,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) {
+define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8175,7 +8175,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f32_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8492,7 +8492,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
+define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9017,7 +9017,7 @@ end:
   ret <4 x float> %phi
 }
 
-define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v4f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9380,7 +9380,7 @@ end:
   ret <4 x float> %phi
 }
 
-define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) {
+define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9463,7 +9463,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v2i64_to_v2f64_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v2i64_to_v2f64_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9564,7 +9564,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) {
+define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9641,7 +9641,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v2f64_to_v2i64_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v2f64_to_v2i64_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9738,7 +9738,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) {
+define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9847,7 +9847,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9973,7 +9973,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) {
+define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10114,7 +10114,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10266,7 +10266,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) {
+define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10375,7 +10375,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10501,7 +10501,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) {
+define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10659,7 +10659,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10827,7 +10827,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10957,7 +10957,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11095,7 +11095,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
+define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11521,7 +11521,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11990,7 +11990,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) {
+define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12316,7 +12316,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i64_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12604,7 +12604,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
+define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13129,7 +13129,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v2i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13492,7 +13492,7 @@ end:
   ret <2 x i64> %phi
 }
 
-define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) {
+define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13595,7 +13595,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13721,7 +13721,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) {
+define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13862,7 +13862,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14014,7 +14014,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) {
+define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14117,7 +14117,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14243,7 +14243,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) {
+define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14401,7 +14401,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14569,7 +14569,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14691,7 +14691,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14833,7 +14833,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
+define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15259,7 +15259,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15728,7 +15728,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) {
+define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16046,7 +16046,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f64_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16354,7 +16354,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
+define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16879,7 +16879,7 @@ end:
   ret <2 x double> %phi
 }
 
-define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v2f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17242,7 +17242,7 @@ end:
   ret <2 x double> %phi
 }
 
-define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) {
+define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17399,7 +17399,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17574,7 +17574,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) {
+define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17714,7 +17714,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17888,7 +17888,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18040,7 +18040,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18215,7 +18215,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
+define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18650,7 +18650,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19123,7 +19123,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) {
+define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19503,7 +19503,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i16_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19852,7 +19852,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
+define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20398,7 +20398,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20788,7 +20788,7 @@ end:
   ret <8 x i16> %phi
 }
 
-define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20956,7 +20956,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21148,7 +21148,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) {
+define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21597,7 +21597,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22095,7 +22095,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) {
+define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22479,7 +22479,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f16_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22848,7 +22848,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
+define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23394,7 +23394,7 @@ end:
   ret <8 x half> %phi
 }
 
-define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23784,7 +23784,7 @@ end:
   ret <8 x half> %phi
 }
 
-define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
+define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v16i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24443,7 +24443,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8bf16_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25151,7 +25151,7 @@ end:
   ret <16 x i8> %phi
 }
 
-define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25699,7 +25699,7 @@ end:
   ret <8 x bfloat> %phi
 }
 
-define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i8_to_v8bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26084,3 +26084,5 @@ end:
   %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <8 x bfloat> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
index 430a93d9e9bf0..c09389ef700ac 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) {
+define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v5f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -92,7 +92,7 @@ end:
   ret <5 x float> %phi
 }
 
-define inreg <5 x float> @bitcast_v5i32_to_v5f32_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v5i32_to_v5f32_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v5f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,7 +202,7 @@ end:
   ret <5 x float> %phi
 }
 
-define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) {
+define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v5i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -286,7 +286,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define inreg <5 x i32> @bitcast_v5f32_to_v5i32_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v5f32_to_v5i32_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v5i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -400,7 +400,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) {
+define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v10i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -519,7 +519,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v10i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -659,7 +659,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) {
+define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v5i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -817,7 +817,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v5i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -990,7 +990,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) {
+define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v10f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1109,7 +1109,7 @@ end:
   ret <10 x half> %phi
 }
 
-define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i32_to_v10f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1249,7 +1249,7 @@ end:
   ret <10 x half> %phi
 }
 
-define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) {
+define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v5i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1427,7 +1427,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v5i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1619,7 +1619,7 @@ end:
   ret <5 x i32> %phi
 }
 
-define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) {
+define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v10i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1736,7 +1736,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v10i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1893,7 +1893,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) {
+define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v5f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2051,7 +2051,7 @@ end:
   ret <5 x float> %phi
 }
 
-define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v5f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2224,7 +2224,7 @@ end:
   ret <5 x float> %phi
 }
 
-define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) {
+define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v10f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2341,7 +2341,7 @@ end:
   ret <10 x half> %phi
 }
 
-define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f32_to_v10f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2498,7 +2498,7 @@ end:
   ret <10 x half> %phi
 }
 
-define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) {
+define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v5f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2676,7 +2676,7 @@ end:
   ret <5 x float> %phi
 }
 
-define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v5f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2868,7 +2868,7 @@ end:
   ret <5 x float> %phi
 }
 
-define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) {
+define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v10f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3045,7 +3045,7 @@ end:
   ret <10 x half> %phi
 }
 
-define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i16_to_v10f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3250,7 +3250,7 @@ end:
   ret <10 x half> %phi
 }
 
-define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) {
+define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v10i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3407,7 +3407,7 @@ end:
   ret <10 x i16> %phi
 }
 
-define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f16_to_v10i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3612,6 +3612,9 @@ end:
   %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <10 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
index dd9fa1456e37a..30193251552ae 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define half @bitcast_i16_to_f16(i16 %a, i32 %b) {
+define half @bitcast_i16_to_f16(i16 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i16_to_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,7 +99,7 @@ end:
   ret half %phi
 }
 
-define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) {
+define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i16_to_f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -181,7 +181,7 @@ end:
   ret half %phi
 }
 
-define i16 @bitcast_f16_to_i16(half %a, i32 %b) {
+define i16 @bitcast_f16_to_i16(half %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f16_to_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -287,7 +287,7 @@ end:
   ret i16 %phi
 }
 
-define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) {
+define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f16_to_i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -394,7 +394,7 @@ end:
   ret i16 %phi
 }
 
-define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) {
+define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i16_to_bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -490,7 +490,7 @@ end:
   ret bfloat %phi
 }
 
-define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) {
+define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i16_to_bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -576,7 +576,7 @@ end:
   ret bfloat %phi
 }
 
-define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) {
+define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_bf16_to_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -719,7 +719,7 @@ end:
   ret i16 %phi
 }
 
-define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) {
+define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_bf16_to_i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -835,7 +835,7 @@ end:
   ret i16 %phi
 }
 
-define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) {
+define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f16_to_bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -938,7 +938,7 @@ end:
   ret bfloat %phi
 }
 
-define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) {
+define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f16_to_bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1049,7 +1049,7 @@ end:
   ret bfloat %phi
 }
 
-define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) {
+define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_bf16_to_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1192,7 +1192,7 @@ end:
   ret half %phi
 }
 
-define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) {
+define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_bf16_to_f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1307,3 +1307,5 @@ end:
   %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret half %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
index d463b115d1088..647b212d4d0bf 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) {
+define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -96,7 +96,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v6i32_to_v6f32_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v6i32_to_v6f32_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -213,7 +213,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) {
+define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -300,7 +300,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v6f32_to_v6i32_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v6f32_to_v6i32_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -422,7 +422,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) {
+define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -512,7 +512,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v6i32_to_v3i64_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v6i32_to_v3i64_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -629,7 +629,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) {
+define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -721,7 +721,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v3i64_to_v6i32_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v3i64_to_v6i32_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -838,7 +838,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) {
+define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -928,7 +928,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v6i32_to_v3f64_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v6i32_to_v3f64_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1045,7 +1045,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) {
+define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1126,7 +1126,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v3f64_to_v6i32_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v3f64_to_v6i32_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1236,7 +1236,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) {
+define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1365,7 +1365,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1518,7 +1518,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) {
+define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1692,7 +1692,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1885,7 +1885,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) {
+define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2014,7 +2014,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i32_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2167,7 +2167,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) {
+define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v6i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2365,7 +2365,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v6i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2580,7 +2580,7 @@ end:
   ret <6 x i32> %phi
 }
 
-define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) {
+define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2667,7 +2667,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v6f32_to_v3i64_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v6f32_to_v3i64_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2789,7 +2789,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) {
+define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2881,7 +2881,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v3i64_to_v6f32_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v3i64_to_v6f32_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2998,7 +2998,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) {
+define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3085,7 +3085,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v6f32_to_v3f64_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v6f32_to_v3f64_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3207,7 +3207,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) {
+define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3288,7 +3288,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v3f64_to_v6f32_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v3f64_to_v6f32_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3398,7 +3398,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) {
+define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3524,7 +3524,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3694,7 +3694,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) {
+define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3868,7 +3868,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4061,7 +4061,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) {
+define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4187,7 +4187,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f32_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4357,7 +4357,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) {
+define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v6f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4555,7 +4555,7 @@ end:
   ret <6 x float> %phi
 }
 
-define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v6f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4770,7 +4770,7 @@ end:
   ret <6 x float> %phi
 }
 
-define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) {
+define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4862,7 +4862,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v3i64_to_v3f64_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v3i64_to_v3f64_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4978,7 +4978,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) {
+define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5059,7 +5059,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v3f64_to_v3i64_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v3f64_to_v3i64_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5169,7 +5169,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) {
+define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5300,7 +5300,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5453,7 +5453,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) {
+define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5627,7 +5627,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5820,7 +5820,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) {
+define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5951,7 +5951,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i64_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6104,7 +6104,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) {
+define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v3i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6302,7 +6302,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v3i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6517,7 +6517,7 @@ end:
   ret <3 x i64> %phi
 }
 
-define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) {
+define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6637,7 +6637,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6795,7 +6795,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) {
+define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6969,7 +6969,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7162,7 +7162,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) {
+define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7282,7 +7282,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f64_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7440,7 +7440,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) {
+define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v3f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7638,7 +7638,7 @@ end:
   ret <3 x double> %phi
 }
 
-define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v3f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7853,7 +7853,7 @@ end:
   ret <3 x double> %phi
 }
 
-define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) {
+define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v12f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8052,7 +8052,7 @@ end:
   ret <12 x half> %phi
 }
 
-define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i16_to_v12f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8283,7 +8283,7 @@ end:
   ret <12 x half> %phi
 }
 
-define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) {
+define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v12i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8458,7 +8458,7 @@ end:
   ret <12 x i16> %phi
 }
 
-define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f16_to_v12i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8686,6 +8686,9 @@ end:
   %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <12 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
index e0fac42ac9d77..c66f5ea8e6c60 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) {
+define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v7f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -101,7 +101,7 @@ end:
   ret <7 x float> %phi
 }
 
-define inreg <7 x float> @bitcast_v7i32_to_v7f32_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v7i32_to_v7f32_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v7f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -226,7 +226,7 @@ end:
   ret <7 x float> %phi
 }
 
-define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) {
+define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v7i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -317,7 +317,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define inreg <7 x i32> @bitcast_v7f32_to_v7i32_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v7f32_to_v7i32_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v7i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -448,7 +448,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) {
+define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v14i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -588,7 +588,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v14i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -755,7 +755,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) {
+define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v7i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -946,7 +946,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v7i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1160,7 +1160,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) {
+define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v14f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1300,7 +1300,7 @@ end:
   ret <14 x half> %phi
 }
 
-define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i32_to_v14f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1467,7 +1467,7 @@ end:
   ret <14 x half> %phi
 }
 
-define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) {
+define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v7i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1686,7 +1686,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v7i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1925,7 +1925,7 @@ end:
   ret <7 x i32> %phi
 }
 
-define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) {
+define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v14i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2061,7 +2061,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v14i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2244,7 +2244,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) {
+define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v7f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2435,7 +2435,7 @@ end:
   ret <7 x float> %phi
 }
 
-define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v7f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2649,7 +2649,7 @@ end:
   ret <7 x float> %phi
 }
 
-define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) {
+define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v14f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2785,7 +2785,7 @@ end:
   ret <14 x half> %phi
 }
 
-define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f32_to_v14f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2968,7 +2968,7 @@ end:
   ret <14 x half> %phi
 }
 
-define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) {
+define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v7f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3187,7 +3187,7 @@ end:
   ret <7 x float> %phi
 }
 
-define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v7f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3426,7 +3426,7 @@ end:
   ret <7 x float> %phi
 }
 
-define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) {
+define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v14f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3646,7 +3646,7 @@ end:
   ret <14 x half> %phi
 }
 
-define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i16_to_v14f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3901,7 +3901,7 @@ end:
   ret <14 x half> %phi
 }
 
-define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) {
+define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v14i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4094,7 +4094,7 @@ end:
   ret <14 x i16> %phi
 }
 
-define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f16_to_v14i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4344,6 +4344,9 @@ end:
   %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <14 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 2123bed663ebb..b64daf30bb761 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) {
+define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -105,7 +105,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v8i32_to_v8f32_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v8i32_to_v8f32_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -237,7 +237,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) {
+define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -331,7 +331,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v8f32_to_v8i32_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v8f32_to_v8i32_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -470,7 +470,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) {
+define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -569,7 +569,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v8i32_to_v4i64_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v8i32_to_v4i64_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -701,7 +701,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) {
+define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -802,7 +802,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v4i64_to_v8i32_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v4i64_to_v8i32_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -934,7 +934,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) {
+define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1033,7 +1033,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v8i32_to_v4f64_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v8i32_to_v4f64_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1165,7 +1165,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) {
+define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1250,7 +1250,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v4f64_to_v8i32_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v4f64_to_v8i32_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1373,7 +1373,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) {
+define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1523,7 +1523,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1703,7 +1703,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) {
+define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1910,7 +1910,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2144,7 +2144,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) {
+define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2294,7 +2294,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2474,7 +2474,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) {
+define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2713,7 +2713,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2975,7 +2975,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3166,7 +3166,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3370,7 +3370,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
+define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4114,7 +4114,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4966,7 +4966,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) {
+define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5511,7 +5511,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i32_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6015,7 +6015,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
+define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6959,7 +6959,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v8i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7650,7 +7650,7 @@ end:
   ret <8 x i32> %phi
 }
 
-define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) {
+define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7744,7 +7744,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v8f32_to_v4i64_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v8f32_to_v4i64_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7883,7 +7883,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) {
+define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7984,7 +7984,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v4i64_to_v8f32_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v4i64_to_v8f32_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8116,7 +8116,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) {
+define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8210,7 +8210,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v8f32_to_v4f64_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v8f32_to_v4f64_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8349,7 +8349,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) {
+define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8434,7 +8434,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v4f64_to_v8f32_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v4f64_to_v8f32_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8557,7 +8557,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) {
+define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8702,7 +8702,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8898,7 +8898,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) {
+define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9105,7 +9105,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9339,7 +9339,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) {
+define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9484,7 +9484,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9680,7 +9680,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) {
+define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9919,7 +9919,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10181,7 +10181,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10367,7 +10367,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10595,7 +10595,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
+define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11339,7 +11339,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12191,7 +12191,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) {
+define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12732,7 +12732,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f32_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13283,7 +13283,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
+define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14227,7 +14227,7 @@ end:
   ret <8 x float> %phi
 }
 
-define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v8f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14918,7 +14918,7 @@ end:
   ret <8 x float> %phi
 }
 
-define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) {
+define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15019,7 +15019,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v4i64_to_v4f64_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v4i64_to_v4f64_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15150,7 +15150,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) {
+define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15235,7 +15235,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v4f64_to_v4i64_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v4f64_to_v4i64_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15358,7 +15358,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) {
+define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15510,7 +15510,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15690,7 +15690,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) {
+define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15897,7 +15897,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16131,7 +16131,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) {
+define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16283,7 +16283,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16463,7 +16463,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) {
+define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16702,7 +16702,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16964,7 +16964,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17157,7 +17157,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17361,7 +17361,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
+define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18105,7 +18105,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18957,7 +18957,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) {
+define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19508,7 +19508,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i64_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20012,7 +20012,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
+define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20956,7 +20956,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v4i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21647,7 +21647,7 @@ end:
   ret <4 x i64> %phi
 }
 
-define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) {
+define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21783,7 +21783,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21963,7 +21963,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) {
+define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22170,7 +22170,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22404,7 +22404,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) {
+define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22540,7 +22540,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22720,7 +22720,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) {
+define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22959,7 +22959,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23221,7 +23221,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23394,7 +23394,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23606,7 +23606,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
+define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24350,7 +24350,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25202,7 +25202,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) {
+define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25737,7 +25737,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f64_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26274,7 +26274,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
+define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27218,7 +27218,7 @@ end:
   ret <4 x double> %phi
 }
 
-define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v4f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27909,7 +27909,7 @@ end:
   ret <4 x double> %phi
 }
 
-define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) {
+define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28151,7 +28151,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28432,7 +28432,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) {
+define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28643,7 +28643,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28917,7 +28917,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29150,7 +29150,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29431,7 +29431,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
+define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30212,7 +30212,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31054,7 +31054,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) {
+define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31702,7 +31702,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i16_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32321,7 +32321,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
+define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33316,7 +33316,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34208,7 +34208,7 @@ end:
   ret <16 x i16> %phi
 }
 
-define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34474,7 +34474,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34784,7 +34784,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) {
+define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35583,7 +35583,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36472,7 +36472,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) {
+define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37127,7 +37127,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f16_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37780,7 +37780,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
+define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38775,7 +38775,7 @@ end:
   ret <16 x half> %phi
 }
 
-define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39667,7 +39667,7 @@ end:
   ret <16 x half> %phi
 }
 
-define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
+define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v32i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40847,7 +40847,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16bf16_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42164,7 +42164,7 @@ end:
   ret <32 x i8> %phi
 }
 
-define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43163,7 +43163,7 @@ end:
   ret <16 x bfloat> %phi
 }
 
-define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i8_to_v16bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44046,3 +44046,5 @@ end:
   %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <16 x bfloat> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
index 6656733d53e51..ead5d76b2e572 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) {
+define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v9f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -109,7 +109,7 @@ end:
   ret <9 x float> %phi
 }
 
-define inreg <9 x float> @bitcast_v9i32_to_v9f32_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v9i32_to_v9f32_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v9f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -249,7 +249,7 @@ end:
   ret <9 x float> %phi
 }
 
-define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) {
+define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v9i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -347,7 +347,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define inreg <9 x i32> @bitcast_v9f32_to_v9i32_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v9f32_to_v9i32_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v9i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,7 +494,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) {
+define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v18i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -654,7 +654,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v18i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -848,7 +848,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) {
+define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v9i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1071,7 +1071,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v9i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1325,7 +1325,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) {
+define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v18f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1485,7 +1485,7 @@ end:
   ret <18 x half> %phi
 }
 
-define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i32_to_v18f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1679,7 +1679,7 @@ end:
   ret <18 x half> %phi
 }
 
-define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) {
+define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v9i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1939,7 +1939,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v9i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2224,7 +2224,7 @@ end:
   ret <9 x i32> %phi
 }
 
-define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) {
+define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v18i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2379,7 +2379,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v18i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2607,7 +2607,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) {
+define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v9f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2830,7 +2830,7 @@ end:
   ret <9 x float> %phi
 }
 
-define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v9f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3084,7 +3084,7 @@ end:
   ret <9 x float> %phi
 }
 
-define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) {
+define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v18f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3239,7 +3239,7 @@ end:
   ret <18 x half> %phi
 }
 
-define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f32_to_v18f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3467,7 +3467,7 @@ end:
   ret <18 x half> %phi
 }
 
-define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) {
+define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v9f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3727,7 +3727,7 @@ end:
   ret <9 x float> %phi
 }
 
-define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v9f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4012,7 +4012,7 @@ end:
   ret <9 x float> %phi
 }
 
-define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) {
+define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v18f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4274,7 +4274,7 @@ end:
   ret <18 x half> %phi
 }
 
-define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i16_to_v18f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4590,7 +4590,7 @@ end:
   ret <18 x half> %phi
 }
 
-define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) {
+define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v18i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4819,7 +4819,7 @@ end:
   ret <18 x i16> %phi
 }
 
-define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f16_to_v18i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5133,6 +5133,9 @@ end:
   %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <18 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 9ae6700ac1825..442767fc1162d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) {
+define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -116,7 +116,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v10i32_to_v10f32_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v10i32_to_v10f32_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -263,7 +263,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) {
+define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -367,7 +367,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v10f32_to_v10i32_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v10f32_to_v10i32_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -521,7 +521,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) {
+define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -694,7 +694,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -901,7 +901,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) {
+define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1141,7 +1141,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1414,7 +1414,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
+define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1587,7 +1587,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1794,7 +1794,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
+define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2074,7 +2074,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2381,7 +2381,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
+define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3251,7 +3251,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4049,7 +4049,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
+define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5247,7 +5247,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6104,7 +6104,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
+define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6214,7 +6214,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v10i32_to_v5f64_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v10i32_to_v5f64_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6361,7 +6361,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) {
+define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6451,7 +6451,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v5f64_to_v10i32_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v5f64_to_v10i32_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6585,7 +6585,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
+define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6695,7 +6695,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v10i32_to_v5i64_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v10i32_to_v5i64_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i32_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6842,7 +6842,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
+define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v10i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6955,7 +6955,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define inreg <10 x i32> @bitcast_v5i64_to_v10i32_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v5i64_to_v10i32_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v10i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7102,7 +7102,7 @@ end:
   ret <10 x i32> %phi
 }
 
-define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) {
+define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7269,7 +7269,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7509,7 +7509,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) {
+define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7749,7 +7749,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8022,7 +8022,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) {
+define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8189,7 +8189,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8429,7 +8429,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) {
+define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8709,7 +8709,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9016,7 +9016,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
+define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9878,7 +9878,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10769,7 +10769,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
+define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11967,7 +11967,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12824,7 +12824,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) {
+define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12928,7 +12928,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v10f32_to_v5f64_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v10f32_to_v5f64_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13103,7 +13103,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) {
+define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13193,7 +13193,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v5f64_to_v10f32_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v5f64_to_v10f32_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13327,7 +13327,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) {
+define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13431,7 +13431,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v10f32_to_v5i64_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v10f32_to_v5i64_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f32_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13606,7 +13606,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) {
+define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v10f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13719,7 +13719,7 @@ end:
   ret <10 x float> %phi
 }
 
-define inreg <10 x float> @bitcast_v5i64_to_v10f32_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v5i64_to_v10f32_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v10f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13866,7 +13866,7 @@ end:
   ret <10 x float> %phi
 }
 
-define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) {
+define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14151,7 +14151,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14492,7 +14492,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) {
+define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14739,7 +14739,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15076,7 +15076,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
+define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16111,7 +16111,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16119,10 +16119,10 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s30, 0
-; SI-NEXT:    v_writelane_b32 v3, s31, 1
-; SI-NEXT:    v_writelane_b32 v3, s34, 2
-; SI-NEXT:    v_writelane_b32 v3, s35, 3
+; SI-NEXT:    v_writelane_b32 v3, s34, 0
+; SI-NEXT:    v_writelane_b32 v3, s35, 1
+; SI-NEXT:    v_writelane_b32 v3, s30, 2
+; SI-NEXT:    v_writelane_b32 v3, s31, 3
 ; SI-NEXT:    s_lshr_b32 s90, s25, 16
 ; SI-NEXT:    s_lshr_b32 s35, s24, 16
 ; SI-NEXT:    s_lshr_b32 s91, s23, 16
@@ -16400,11 +16400,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v3, 2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s35, v3, 3
-; SI-NEXT:    v_readlane_b32 s34, v3, 2
-; SI-NEXT:    v_readlane_b32 s31, v3, 1
-; SI-NEXT:    v_readlane_b32 s30, v3, 0
+; SI-NEXT:    v_readlane_b32 s31, v3, 3
+; SI-NEXT:    v_readlane_b32 s35, v3, 1
+; SI-NEXT:    v_readlane_b32 s34, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17082,7 +17082,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
+define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18418,7 +18418,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18426,14 +18426,15 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v27, s30, 0
-; SI-NEXT:    v_writelane_b32 v27, s31, 1
-; SI-NEXT:    v_writelane_b32 v27, s34, 2
-; SI-NEXT:    v_writelane_b32 v27, s35, 3
-; SI-NEXT:    v_writelane_b32 v27, s36, 4
-; SI-NEXT:    v_writelane_b32 v27, s37, 5
+; SI-NEXT:    v_writelane_b32 v27, s34, 0
+; SI-NEXT:    v_writelane_b32 v27, s35, 1
+; SI-NEXT:    v_writelane_b32 v27, s36, 2
+; SI-NEXT:    v_writelane_b32 v27, s37, 3
+; SI-NEXT:    v_writelane_b32 v27, s38, 4
+; SI-NEXT:    v_writelane_b32 v27, s39, 5
+; SI-NEXT:    v_writelane_b32 v27, s30, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v27, s38, 6
+; SI-NEXT:    v_writelane_b32 v27, s31, 7
 ; SI-NEXT:    v_readfirstlane_b32 s90, v25
 ; SI-NEXT:    v_readfirstlane_b32 s91, v24
 ; SI-NEXT:    v_readfirstlane_b32 s93, v23
@@ -18461,7 +18462,6 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_readfirstlane_b32 s58, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s59, v0
-; SI-NEXT:    v_writelane_b32 v27, s39, 7
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xff
@@ -18742,6 +18742,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s14, s15, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v27, 6
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v1, s13
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
@@ -18752,14 +18753,13 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v7, s7
 ; SI-NEXT:    v_mov_b32_e32 v8, s4
 ; SI-NEXT:    v_mov_b32_e32 v9, s5
-; SI-NEXT:    v_readlane_b32 s39, v27, 7
-; SI-NEXT:    v_readlane_b32 s38, v27, 6
-; SI-NEXT:    v_readlane_b32 s37, v27, 5
-; SI-NEXT:    v_readlane_b32 s36, v27, 4
-; SI-NEXT:    v_readlane_b32 s35, v27, 3
-; SI-NEXT:    v_readlane_b32 s34, v27, 2
-; SI-NEXT:    v_readlane_b32 s31, v27, 1
-; SI-NEXT:    v_readlane_b32 s30, v27, 0
+; SI-NEXT:    v_readlane_b32 s31, v27, 7
+; SI-NEXT:    v_readlane_b32 s39, v27, 5
+; SI-NEXT:    v_readlane_b32 s38, v27, 4
+; SI-NEXT:    v_readlane_b32 s37, v27, 3
+; SI-NEXT:    v_readlane_b32 s36, v27, 2
+; SI-NEXT:    v_readlane_b32 s35, v27, 1
+; SI-NEXT:    v_readlane_b32 s34, v27, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19524,7 +19524,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) {
+define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19764,7 +19764,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20070,7 +20070,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) {
+define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20223,7 +20223,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20443,7 +20443,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) {
+define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20683,7 +20683,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i16_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20989,7 +20989,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
+define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v20i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21165,7 +21165,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21372,7 +21372,7 @@ end:
   ret <20 x i16> %phi
 }
 
-define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
+define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22391,7 +22391,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22399,10 +22399,10 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v12, s30, 0
-; SI-NEXT:    v_writelane_b32 v12, s31, 1
-; SI-NEXT:    v_writelane_b32 v12, s34, 2
-; SI-NEXT:    v_writelane_b32 v12, s35, 3
+; SI-NEXT:    v_writelane_b32 v12, s34, 0
+; SI-NEXT:    v_writelane_b32 v12, s35, 1
+; SI-NEXT:    v_writelane_b32 v12, s30, 2
+; SI-NEXT:    v_writelane_b32 v12, s31, 3
 ; SI-NEXT:    s_lshr_b32 s34, s25, 16
 ; SI-NEXT:    s_lshr_b32 s35, s24, 16
 ; SI-NEXT:    s_lshr_b32 s30, s23, 16
@@ -22760,11 +22760,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
+; SI-NEXT:    v_readlane_b32 s30, v12, 2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s35, v12, 3
-; SI-NEXT:    v_readlane_b32 s34, v12, 2
-; SI-NEXT:    v_readlane_b32 s31, v12, 1
-; SI-NEXT:    v_readlane_b32 s30, v12, 0
+; SI-NEXT:    v_readlane_b32 s31, v12, 3
+; SI-NEXT:    v_readlane_b32 s35, v12, 1
+; SI-NEXT:    v_readlane_b32 s34, v12, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23424,7 +23424,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
+define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24760,7 +24760,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24768,14 +24768,15 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v27, s30, 0
-; SI-NEXT:    v_writelane_b32 v27, s31, 1
-; SI-NEXT:    v_writelane_b32 v27, s34, 2
-; SI-NEXT:    v_writelane_b32 v27, s35, 3
-; SI-NEXT:    v_writelane_b32 v27, s36, 4
-; SI-NEXT:    v_writelane_b32 v27, s37, 5
+; SI-NEXT:    v_writelane_b32 v27, s34, 0
+; SI-NEXT:    v_writelane_b32 v27, s35, 1
+; SI-NEXT:    v_writelane_b32 v27, s36, 2
+; SI-NEXT:    v_writelane_b32 v27, s37, 3
+; SI-NEXT:    v_writelane_b32 v27, s38, 4
+; SI-NEXT:    v_writelane_b32 v27, s39, 5
+; SI-NEXT:    v_writelane_b32 v27, s30, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v26
-; SI-NEXT:    v_writelane_b32 v27, s38, 6
+; SI-NEXT:    v_writelane_b32 v27, s31, 7
 ; SI-NEXT:    v_readfirstlane_b32 s90, v25
 ; SI-NEXT:    v_readfirstlane_b32 s91, v24
 ; SI-NEXT:    v_readfirstlane_b32 s93, v23
@@ -24803,7 +24804,6 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_readfirstlane_b32 s58, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s59, v0
-; SI-NEXT:    v_writelane_b32 v27, s39, 7
 ; SI-NEXT:    s_cbranch_scc0 .LBB63_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xff
@@ -25084,6 +25084,7 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s14, s15, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v27, 6
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v1, s13
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
@@ -25094,14 +25095,13 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v7, s7
 ; SI-NEXT:    v_mov_b32_e32 v8, s4
 ; SI-NEXT:    v_mov_b32_e32 v9, s5
-; SI-NEXT:    v_readlane_b32 s39, v27, 7
-; SI-NEXT:    v_readlane_b32 s38, v27, 6
-; SI-NEXT:    v_readlane_b32 s37, v27, 5
-; SI-NEXT:    v_readlane_b32 s36, v27, 4
-; SI-NEXT:    v_readlane_b32 s35, v27, 3
-; SI-NEXT:    v_readlane_b32 s34, v27, 2
-; SI-NEXT:    v_readlane_b32 s31, v27, 1
-; SI-NEXT:    v_readlane_b32 s30, v27, 0
+; SI-NEXT:    v_readlane_b32 s31, v27, 7
+; SI-NEXT:    v_readlane_b32 s39, v27, 5
+; SI-NEXT:    v_readlane_b32 s38, v27, 4
+; SI-NEXT:    v_readlane_b32 s37, v27, 3
+; SI-NEXT:    v_readlane_b32 s36, v27, 2
+; SI-NEXT:    v_readlane_b32 s35, v27, 1
+; SI-NEXT:    v_readlane_b32 s34, v27, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25866,7 +25866,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
+define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26146,7 +26146,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26499,7 +26499,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
+define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26652,7 +26652,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26872,7 +26872,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
+define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27152,7 +27152,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f16_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27505,7 +27505,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
+define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v20f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27681,7 +27681,7 @@ end:
   ret <20 x half> %phi
 }
 
-define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v20f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27888,7 +27888,7 @@ end:
   ret <20 x half> %phi
 }
 
-define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
+define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29140,7 +29140,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30021,7 +30021,7 @@ end:
   ret <5 x double> %phi
 }
 
-define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
+define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30866,7 +30866,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31732,7 +31732,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
+define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32984,7 +32984,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i8_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33865,7 +33865,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
+define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v40i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34741,7 +34741,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35539,7 +35539,7 @@ end:
   ret <40 x i8> %phi
 }
 
-define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) {
+define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v5i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35629,7 +35629,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define inreg <5 x i64> @bitcast_v5f64_to_v5i64_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v5f64_to_v5i64_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5f64_to_v5i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35784,7 +35784,7 @@ end:
   ret <5 x i64> %phi
 }
 
-define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
+define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v5f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35897,7 +35897,7 @@ end:
   ret <5 x double> %phi
 }
 
-define inreg <5 x double> @bitcast_v5i64_to_v5f64_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v5i64_to_v5f64_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v5i64_to_v5f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36042,3 +36042,5 @@ end:
   %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <5 x double> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 0a8af1ab3e547..123d1042e27c9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define float @bitcast_i32_to_f32(i32 %a, i32 %b) {
+define float @bitcast_i32_to_f32(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -76,7 +76,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_i32_to_f32_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg float @bitcast_i32_to_f32_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -157,7 +157,7 @@ end:
   ret float %phi
 }
 
-define i32 @bitcast_f32_to_i32(float %a, i32 %b) {
+define i32 @bitcast_f32_to_i32(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,7 +227,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_f32_to_i32_scalar(float inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_f32_to_i32_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -310,7 +310,7 @@ end:
   ret i32 %phi
 }
 
-define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) {
+define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -388,7 +388,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -475,7 +475,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) {
+define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -567,7 +567,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -661,7 +661,7 @@ end:
   ret i32 %phi
 }
 
-define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) {
+define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -739,7 +739,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,7 +826,7 @@ end:
   ret <2 x half> %phi
 }
 
-define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) {
+define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -918,7 +918,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1019,7 +1019,7 @@ end:
   ret i32 %phi
 }
 
-define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) {
+define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1102,7 +1102,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1192,7 +1192,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
+define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1376,7 +1376,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1571,7 +1571,7 @@ end:
   ret i32 %phi
 }
 
-define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) {
+define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1641,7 +1641,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_i32_to_v1i32_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_i32_to_v1i32_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1722,7 +1722,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) {
+define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1792,7 +1792,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v1i32_to_i32_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v1i32_to_i32_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1873,7 +1873,7 @@ end:
   ret i32 %phi
 }
 
-define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) {
+define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2039,7 +2039,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_i32_to_v4i8_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_i32_to_v4i8_scalar(i32 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i32_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2166,7 +2166,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
+define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2401,7 +2401,7 @@ end:
   ret i32 %phi
 }
 
-define inreg i32 @bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2555,7 +2555,7 @@ end:
   ret i32 %phi
 }
 
-define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) {
+define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2633,7 +2633,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2724,7 +2724,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) {
+define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2816,7 +2816,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2910,7 +2910,7 @@ end:
   ret float %phi
 }
 
-define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) {
+define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2988,7 +2988,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3079,7 +3079,7 @@ end:
   ret <2 x half> %phi
 }
 
-define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) {
+define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3171,7 +3171,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3272,7 +3272,7 @@ end:
   ret float %phi
 }
 
-define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) {
+define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3355,7 +3355,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3450,7 +3450,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
+define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3634,7 +3634,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3829,7 +3829,7 @@ end:
   ret float %phi
 }
 
-define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) {
+define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3899,7 +3899,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_f32_to_v1i32_scalar(float inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_f32_to_v1i32_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3982,7 +3982,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) {
+define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4052,7 +4052,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v1i32_to_f32_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v1i32_to_f32_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4133,7 +4133,7 @@ end:
   ret float %phi
 }
 
-define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) {
+define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4299,7 +4299,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_f32_to_v4i8_scalar(float inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_f32_to_v4i8_scalar(float inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f32_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4429,7 +4429,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
+define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4664,7 +4664,7 @@ end:
   ret float %phi
 }
 
-define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4818,7 +4818,7 @@ end:
   ret float %phi
 }
 
-define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) {
+define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4907,7 +4907,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5005,7 +5005,7 @@ end:
   ret <2 x half> %phi
 }
 
-define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) {
+define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5091,7 +5091,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5193,7 +5193,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5274,7 +5274,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5370,7 +5370,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) {
+define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5545,7 +5545,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5737,7 +5737,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) {
+define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5829,7 +5829,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5923,7 +5923,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) {
+define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6001,7 +6001,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6088,7 +6088,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) {
+define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6269,7 +6269,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i16_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6410,7 +6410,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
+define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6645,7 +6645,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6805,7 +6805,7 @@ end:
   ret <2 x i16> %phi
 }
 
-define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6900,7 +6900,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7007,7 +7007,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
+define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7189,7 +7189,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7387,7 +7387,7 @@ end:
   ret <2 x half> %phi
 }
 
-define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) {
+define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7479,7 +7479,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7580,7 +7580,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) {
+define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7658,7 +7658,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7745,7 +7745,7 @@ end:
   ret <2 x half> %phi
 }
 
-define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) {
+define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7925,7 +7925,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f16_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8069,7 +8069,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
+define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8304,7 +8304,7 @@ end:
   ret <2 x half> %phi
 }
 
-define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8464,7 +8464,7 @@ end:
   ret <2 x half> %phi
 }
 
-define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
+define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8648,7 +8648,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8843,7 +8843,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8926,7 +8926,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9016,7 +9016,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
+define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9267,7 +9267,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2bf16_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9515,7 +9515,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9747,7 +9747,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v2bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9903,7 +9903,7 @@ end:
   ret <2 x bfloat> %phi
 }
 
-define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) {
+define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10069,7 +10069,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define inreg <4 x i8> @bitcast_v1i32_to_v4i8_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v1i32_to_v4i8_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v1i32_to_v4i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10196,7 +10196,7 @@ end:
   ret <4 x i8> %phi
 }
 
-define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
+define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10431,7 +10431,7 @@ end:
   ret <1 x i32> %phi
 }
 
-define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i8_to_v1i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10584,3 +10584,5 @@ end:
   %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <1 x i32> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
index 70ed2ca42b706..79c9fc7faf339 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) {
+define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v11f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,7 +120,7 @@ end:
   ret <11 x float> %phi
 }
 
-define inreg <11 x float> @bitcast_v11i32_to_v11f32_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v11i32_to_v11f32_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v11f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -275,7 +275,7 @@ end:
   ret <11 x float> %phi
 }
 
-define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) {
+define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v11i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -383,7 +383,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define inreg <11 x i32> @bitcast_v11f32_to_v11i32_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v11f32_to_v11i32_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v11i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -545,7 +545,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) {
+define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v22i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -728,7 +728,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v22i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -949,7 +949,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) {
+define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v11i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1205,7 +1205,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v11i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1498,7 +1498,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) {
+define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v22f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1681,7 +1681,7 @@ end:
   ret <22 x half> %phi
 }
 
-define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i32_to_v22f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1902,7 +1902,7 @@ end:
   ret <22 x half> %phi
 }
 
-define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) {
+define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v11i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2202,7 +2202,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v11i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2532,7 +2532,7 @@ end:
   ret <11 x i32> %phi
 }
 
-define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) {
+define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v22i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2709,7 +2709,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v22i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2961,7 +2961,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) {
+define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v11f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3217,7 +3217,7 @@ end:
   ret <11 x float> %phi
 }
 
-define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v11f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3510,7 +3510,7 @@ end:
   ret <11 x float> %phi
 }
 
-define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) {
+define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v22f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3687,7 +3687,7 @@ end:
   ret <22 x half> %phi
 }
 
-define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f32_to_v22f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3939,7 +3939,7 @@ end:
   ret <22 x half> %phi
 }
 
-define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) {
+define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v11f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4239,7 +4239,7 @@ end:
   ret <11 x float> %phi
 }
 
-define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v11f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4569,7 +4569,7 @@ end:
   ret <11 x float> %phi
 }
 
-define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) {
+define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v22f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4874,7 +4874,7 @@ end:
   ret <22 x half> %phi
 }
 
-define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i16_to_v22f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5238,7 +5238,7 @@ end:
   ret <22 x half> %phi
 }
 
-define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) {
+define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v22i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5502,7 +5502,7 @@ end:
   ret <22 x i16> %phi
 }
 
-define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f16_to_v22i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5859,6 +5859,9 @@ end:
   %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <22 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
index 60c5431f7e4c6..cb0e72323a165 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) {
+define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -124,7 +124,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v12i32_to_v12f32_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v12i32_to_v12f32_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -286,7 +286,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) {
+define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -397,7 +397,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v12f32_to_v12i32_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v12f32_to_v12i32_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -566,7 +566,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) {
+define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -684,7 +684,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v12i32_to_v6f64_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v12i32_to_v6f64_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -846,7 +846,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) {
+define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -940,7 +940,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v6f64_to_v12i32_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v6f64_to_v12i32_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1085,7 +1085,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) {
+define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1203,7 +1203,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v12i32_to_v6i64_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v12i32_to_v6i64_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1365,7 +1365,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) {
+define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1486,7 +1486,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v6i64_to_v12i32_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v6i64_to_v12i32_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1648,7 +1648,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) {
+define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1841,7 +1841,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2075,7 +2075,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) {
+define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2347,7 +2347,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2659,7 +2659,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) {
+define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2852,7 +2852,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i32_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3086,7 +3086,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) {
+define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v12i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3406,7 +3406,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v12i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3758,7 +3758,7 @@ end:
   ret <12 x i32> %phi
 }
 
-define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) {
+define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3869,7 +3869,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v12f32_to_v6f64_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v12f32_to_v6f64_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4052,7 +4052,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) {
+define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4146,7 +4146,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v6f64_to_v12f32_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v6f64_to_v12f32_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4291,7 +4291,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) {
+define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4402,7 +4402,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v12f32_to_v6i64_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v12f32_to_v6i64_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4585,7 +4585,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) {
+define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4706,7 +4706,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v6i64_to_v12f32_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v6i64_to_v12f32_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4868,7 +4868,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) {
+define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5054,7 +5054,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5318,7 +5318,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) {
+define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5590,7 +5590,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5902,7 +5902,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) {
+define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6088,7 +6088,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f32_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6352,7 +6352,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) {
+define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v12f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6672,7 +6672,7 @@ end:
   ret <12 x float> %phi
 }
 
-define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v12f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7024,7 +7024,7 @@ end:
   ret <12 x float> %phi
 }
 
-define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) {
+define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7118,7 +7118,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v6f64_to_v6i64_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v6f64_to_v6i64_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7277,7 +7277,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) {
+define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7398,7 +7398,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v6i64_to_v6f64_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v6i64_to_v6f64_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7559,7 +7559,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) {
+define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7728,7 +7728,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7968,7 +7968,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) {
+define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8240,7 +8240,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8582,7 +8582,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) {
+define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8751,7 +8751,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f64_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8991,7 +8991,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) {
+define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v6f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9311,7 +9311,7 @@ end:
   ret <6 x double> %phi
 }
 
-define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v6f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9702,7 +9702,7 @@ end:
   ret <6 x double> %phi
 }
 
-define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) {
+define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9898,7 +9898,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10132,7 +10132,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) {
+define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10404,7 +10404,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10746,7 +10746,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) {
+define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10942,7 +10942,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i64_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11176,7 +11176,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) {
+define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v6i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11496,7 +11496,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v6i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11887,7 +11887,7 @@ end:
   ret <6 x i64> %phi
 }
 
-define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) {
+define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v24f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12214,7 +12214,7 @@ end:
   ret <24 x half> %phi
 }
 
-define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i16_to_v24f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12603,7 +12603,7 @@ end:
   ret <24 x half> %phi
 }
 
-define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) {
+define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v24i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12885,7 +12885,7 @@ end:
   ret <24 x i16> %phi
 }
 
-define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13264,6 +13264,9 @@ end:
   %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <24 x i16> %phi
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
index cd47411c7e51c..f7b7547cad1bd 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <14 x float> @bitcast_v14i32_to_v14f32(<14 x i32> %a, i32 %b) {
+define <14 x float> @bitcast_v14i32_to_v14f32(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -132,7 +132,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v14i32_to_v14f32_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v14i32_to_v14f32_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -312,7 +312,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) {
+define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -431,7 +431,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v14f32_to_v14i32_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v14f32_to_v14i32_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -625,7 +625,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) {
+define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -751,7 +751,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v14i32_to_v7i64_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v14i32_to_v7i64_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -931,7 +931,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) {
+define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1061,7 +1061,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v7i64_to_v14i32_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v7i64_to_v14i32_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1241,7 +1241,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) {
+define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1367,7 +1367,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v14i32_to_v7f64_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v14i32_to_v7f64_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1547,7 +1547,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) {
+define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1645,7 +1645,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v7f64_to_v14i32_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v7f64_to_v14i32_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1811,7 +1811,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) {
+define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2024,7 +2024,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2288,7 +2288,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) {
+define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2592,7 +2592,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2973,7 +2973,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) {
+define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3186,7 +3186,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i32_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3450,7 +3450,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) {
+define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v14i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3810,7 +3810,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v14i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4242,7 +4242,7 @@ end:
   ret <14 x i32> %phi
 }
 
-define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) {
+define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4361,7 +4361,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v14f32_to_v7i64_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v14f32_to_v7i64_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4555,7 +4555,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) {
+define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4685,7 +4685,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v7i64_to_v14f32_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v7i64_to_v14f32_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4865,7 +4865,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) {
+define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4984,7 +4984,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v14f32_to_v7f64_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v14f32_to_v7f64_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5178,7 +5178,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) {
+define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5276,7 +5276,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v7f64_to_v14f32_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v7f64_to_v14f32_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5442,7 +5442,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) {
+define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5648,7 +5648,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5939,7 +5939,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) {
+define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6243,7 +6243,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6624,7 +6624,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) {
+define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6830,7 +6830,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f32_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7121,7 +7121,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) {
+define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v14f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7481,7 +7481,7 @@ end:
   ret <14 x float> %phi
 }
 
-define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v14f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7913,7 +7913,7 @@ end:
   ret <14 x float> %phi
 }
 
-define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) {
+define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8043,7 +8043,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v7i64_to_v7f64_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v7i64_to_v7f64_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8222,7 +8222,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) {
+define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8320,7 +8320,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v7f64_to_v7i64_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v7f64_to_v7i64_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8486,7 +8486,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) {
+define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8703,7 +8703,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8967,7 +8967,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) {
+define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9271,7 +9271,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9652,7 +9652,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) {
+define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9869,7 +9869,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7i64_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10133,7 +10133,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) {
+define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v7i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10493,7 +10493,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v7i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10925,7 +10925,7 @@ end:
   ret <7 x i64> %phi
 }
 
-define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) {
+define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11110,7 +11110,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11373,7 +11373,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) {
+define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11677,7 +11677,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12058,7 +12058,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) {
+define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12243,7 +12243,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v7f64_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12506,7 +12506,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) {
+define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v7f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12866,7 +12866,7 @@ end:
   ret <7 x double> %phi
 }
 
-define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v7f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13298,7 +13298,7 @@ end:
   ret <7 x double> %phi
 }
 
-define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) {
+define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v28f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13688,7 +13688,7 @@ end:
   ret <28 x half> %phi
 }
 
-define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i16_to_v28f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14128,7 +14128,7 @@ end:
   ret <28 x half> %phi
 }
 
-define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) {
+define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v28i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14445,7 +14445,7 @@ end:
   ret <28 x i16> %phi
 }
 
-define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14870,6 +14870,8 @@ end:
   %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <28 x i16> %phi
 }
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-FAKE16: {{.*}}
 ; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index 5697cb1e909c8..62cd5098d32f2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
+define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3bf16_to_v3f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -242,7 +242,7 @@ end:
   ret <3 x half> %phi
 }
 
-define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3bf16_to_v3f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -500,7 +500,7 @@ end:
   ret <3 x half> %phi
 }
 
-define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) {
+define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f16_to_v3bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -608,7 +608,7 @@ end:
   ret <3 x bfloat> %phi
 }
 
-define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f16_to_v3bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -730,7 +730,7 @@ end:
   ret <3 x bfloat> %phi
 }
 
-define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
+define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3bf16_to_v3i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -958,7 +958,7 @@ end:
   ret <3 x i16> %phi
 }
 
-define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3bf16_to_v3i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1207,7 +1207,7 @@ end:
   ret <3 x i16> %phi
 }
 
-define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) {
+define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i16_to_v3bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1303,7 +1303,7 @@ end:
   ret <3 x bfloat> %phi
 }
 
-define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i16_to_v3bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1413,7 +1413,7 @@ end:
   ret <3 x bfloat> %phi
 }
 
-define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) {
+define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f16_to_v3i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1507,7 +1507,7 @@ end:
   ret <3 x i16> %phi
 }
 
-define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f16_to_v3i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1620,7 +1620,7 @@ end:
   ret <3 x i16> %phi
 }
 
-define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) {
+define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i16_to_v3f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1720,7 +1720,7 @@ end:
   ret <3 x half> %phi
 }
 
-define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i16_to_v3f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1830,3 +1830,5 @@ end:
   %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <3 x half> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index cdd050175c8ba..fd08154118f5c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs=0 < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) {
+define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -140,7 +140,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v16i32_to_v16f32_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v16i32_to_v16f32_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -341,7 +341,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) {
+define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -467,7 +467,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -522,8 +522,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB3_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -583,8 +583,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB3_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -644,8 +644,8 @@ define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB3_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -712,7 +712,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) {
+define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -846,7 +846,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v16i32_to_v8i64_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v16i32_to_v8i64_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1047,7 +1047,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) {
+define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1185,7 +1185,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v8i64_to_v16i32_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v8i64_to_v16i32_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1386,7 +1386,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) {
+define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1520,7 +1520,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v16i32_to_v8f64_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v16i32_to_v8f64_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1721,7 +1721,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) {
+define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1823,7 +1823,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1870,8 +1870,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB11_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1923,8 +1923,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB11_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1976,8 +1976,8 @@ define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB11_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2036,7 +2036,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) {
+define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2269,7 +2269,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2566,7 +2566,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) {
+define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2902,7 +2902,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3269,8 +3269,8 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB15_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -3337,7 +3337,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) {
+define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3570,7 +3570,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3867,7 +3867,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) {
+define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4267,7 +4267,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4637,8 +4637,8 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    v_readlane_b32 s30, v17, 0
+; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4699,8 +4699,8 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB19_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4767,7 +4767,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5080,7 +5080,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5425,7 +5425,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
+define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6831,7 +6831,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7411,8 +7411,8 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB23_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -7762,8 +7762,8 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB23_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8477,7 +8477,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
+define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9944,7 +9944,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16i32_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9952,40 +9952,40 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v4, s30, 0
-; SI-NEXT:    v_writelane_b32 v4, s31, 1
-; SI-NEXT:    v_writelane_b32 v4, s34, 2
-; SI-NEXT:    v_writelane_b32 v4, s35, 3
-; SI-NEXT:    v_writelane_b32 v4, s36, 4
-; SI-NEXT:    v_writelane_b32 v4, s37, 5
-; SI-NEXT:    v_writelane_b32 v4, s38, 6
-; SI-NEXT:    v_writelane_b32 v4, s39, 7
-; SI-NEXT:    v_writelane_b32 v4, s48, 8
-; SI-NEXT:    v_writelane_b32 v4, s49, 9
-; SI-NEXT:    v_writelane_b32 v4, s50, 10
-; SI-NEXT:    v_writelane_b32 v4, s51, 11
-; SI-NEXT:    v_writelane_b32 v4, s52, 12
-; SI-NEXT:    v_writelane_b32 v4, s53, 13
-; SI-NEXT:    v_writelane_b32 v4, s54, 14
-; SI-NEXT:    v_writelane_b32 v4, s55, 15
-; SI-NEXT:    v_writelane_b32 v4, s64, 16
-; SI-NEXT:    v_writelane_b32 v4, s65, 17
-; SI-NEXT:    v_writelane_b32 v4, s66, 18
-; SI-NEXT:    v_writelane_b32 v4, s67, 19
-; SI-NEXT:    v_writelane_b32 v4, s68, 20
-; SI-NEXT:    v_writelane_b32 v4, s69, 21
-; SI-NEXT:    v_writelane_b32 v4, s70, 22
-; SI-NEXT:    v_writelane_b32 v4, s71, 23
-; SI-NEXT:    v_writelane_b32 v4, s80, 24
-; SI-NEXT:    v_writelane_b32 v4, s81, 25
-; SI-NEXT:    v_writelane_b32 v4, s82, 26
-; SI-NEXT:    v_writelane_b32 v4, s83, 27
+; SI-NEXT:    v_writelane_b32 v4, s34, 0
+; SI-NEXT:    v_writelane_b32 v4, s35, 1
+; SI-NEXT:    v_writelane_b32 v4, s36, 2
+; SI-NEXT:    v_writelane_b32 v4, s37, 3
+; SI-NEXT:    v_writelane_b32 v4, s38, 4
+; SI-NEXT:    v_writelane_b32 v4, s39, 5
+; SI-NEXT:    v_writelane_b32 v4, s48, 6
+; SI-NEXT:    v_writelane_b32 v4, s49, 7
+; SI-NEXT:    v_writelane_b32 v4, s50, 8
+; SI-NEXT:    v_writelane_b32 v4, s51, 9
+; SI-NEXT:    v_writelane_b32 v4, s52, 10
+; SI-NEXT:    v_writelane_b32 v4, s53, 11
+; SI-NEXT:    v_writelane_b32 v4, s54, 12
+; SI-NEXT:    v_writelane_b32 v4, s55, 13
+; SI-NEXT:    v_writelane_b32 v4, s64, 14
+; SI-NEXT:    v_writelane_b32 v4, s65, 15
+; SI-NEXT:    v_writelane_b32 v4, s66, 16
+; SI-NEXT:    v_writelane_b32 v4, s67, 17
+; SI-NEXT:    v_writelane_b32 v4, s68, 18
+; SI-NEXT:    v_writelane_b32 v4, s69, 19
+; SI-NEXT:    v_writelane_b32 v4, s70, 20
+; SI-NEXT:    v_writelane_b32 v4, s71, 21
+; SI-NEXT:    v_writelane_b32 v4, s80, 22
+; SI-NEXT:    v_writelane_b32 v4, s81, 23
+; SI-NEXT:    v_writelane_b32 v4, s82, 24
+; SI-NEXT:    v_writelane_b32 v4, s83, 25
+; SI-NEXT:    v_writelane_b32 v4, s84, 26
+; SI-NEXT:    v_writelane_b32 v4, s85, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
-; SI-NEXT:    v_writelane_b32 v4, s84, 28
+; SI-NEXT:    v_writelane_b32 v4, s30, 28
 ; SI-NEXT:    v_readfirstlane_b32 s5, v2
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v1
-; SI-NEXT:    v_writelane_b32 v4, s85, 29
+; SI-NEXT:    v_writelane_b32 v4, s31, 29
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 24
@@ -10306,37 +10306,37 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v4, 28
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s85, v4, 29
-; SI-NEXT:    v_readlane_b32 s84, v4, 28
-; SI-NEXT:    v_readlane_b32 s83, v4, 27
-; SI-NEXT:    v_readlane_b32 s82, v4, 26
-; SI-NEXT:    v_readlane_b32 s81, v4, 25
-; SI-NEXT:    v_readlane_b32 s80, v4, 24
-; SI-NEXT:    v_readlane_b32 s71, v4, 23
-; SI-NEXT:    v_readlane_b32 s70, v4, 22
-; SI-NEXT:    v_readlane_b32 s69, v4, 21
-; SI-NEXT:    v_readlane_b32 s68, v4, 20
-; SI-NEXT:    v_readlane_b32 s67, v4, 19
-; SI-NEXT:    v_readlane_b32 s66, v4, 18
-; SI-NEXT:    v_readlane_b32 s65, v4, 17
-; SI-NEXT:    v_readlane_b32 s64, v4, 16
-; SI-NEXT:    v_readlane_b32 s55, v4, 15
-; SI-NEXT:    v_readlane_b32 s54, v4, 14
-; SI-NEXT:    v_readlane_b32 s53, v4, 13
-; SI-NEXT:    v_readlane_b32 s52, v4, 12
-; SI-NEXT:    v_readlane_b32 s51, v4, 11
-; SI-NEXT:    v_readlane_b32 s50, v4, 10
-; SI-NEXT:    v_readlane_b32 s49, v4, 9
-; SI-NEXT:    v_readlane_b32 s48, v4, 8
-; SI-NEXT:    v_readlane_b32 s39, v4, 7
-; SI-NEXT:    v_readlane_b32 s38, v4, 6
-; SI-NEXT:    v_readlane_b32 s37, v4, 5
-; SI-NEXT:    v_readlane_b32 s36, v4, 4
-; SI-NEXT:    v_readlane_b32 s35, v4, 3
-; SI-NEXT:    v_readlane_b32 s34, v4, 2
-; SI-NEXT:    v_readlane_b32 s31, v4, 1
-; SI-NEXT:    v_readlane_b32 s30, v4, 0
+; SI-NEXT:    v_readlane_b32 s31, v4, 29
+; SI-NEXT:    v_readlane_b32 s85, v4, 27
+; SI-NEXT:    v_readlane_b32 s84, v4, 26
+; SI-NEXT:    v_readlane_b32 s83, v4, 25
+; SI-NEXT:    v_readlane_b32 s82, v4, 24
+; SI-NEXT:    v_readlane_b32 s81, v4, 23
+; SI-NEXT:    v_readlane_b32 s80, v4, 22
+; SI-NEXT:    v_readlane_b32 s71, v4, 21
+; SI-NEXT:    v_readlane_b32 s70, v4, 20
+; SI-NEXT:    v_readlane_b32 s69, v4, 19
+; SI-NEXT:    v_readlane_b32 s68, v4, 18
+; SI-NEXT:    v_readlane_b32 s67, v4, 17
+; SI-NEXT:    v_readlane_b32 s66, v4, 16
+; SI-NEXT:    v_readlane_b32 s65, v4, 15
+; SI-NEXT:    v_readlane_b32 s64, v4, 14
+; SI-NEXT:    v_readlane_b32 s55, v4, 13
+; SI-NEXT:    v_readlane_b32 s54, v4, 12
+; SI-NEXT:    v_readlane_b32 s53, v4, 11
+; SI-NEXT:    v_readlane_b32 s52, v4, 10
+; SI-NEXT:    v_readlane_b32 s51, v4, 9
+; SI-NEXT:    v_readlane_b32 s50, v4, 8
+; SI-NEXT:    v_readlane_b32 s49, v4, 7
+; SI-NEXT:    v_readlane_b32 s48, v4, 6
+; SI-NEXT:    v_readlane_b32 s39, v4, 5
+; SI-NEXT:    v_readlane_b32 s38, v4, 4
+; SI-NEXT:    v_readlane_b32 s37, v4, 3
+; SI-NEXT:    v_readlane_b32 s36, v4, 2
+; SI-NEXT:    v_readlane_b32 s35, v4, 1
+; SI-NEXT:    v_readlane_b32 s34, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10399,30 +10399,30 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v4, s30, 0
-; VI-NEXT:    v_writelane_b32 v4, s31, 1
-; VI-NEXT:    v_writelane_b32 v4, s34, 2
-; VI-NEXT:    v_writelane_b32 v4, s35, 3
-; VI-NEXT:    v_writelane_b32 v4, s36, 4
-; VI-NEXT:    v_writelane_b32 v4, s37, 5
-; VI-NEXT:    v_writelane_b32 v4, s38, 6
-; VI-NEXT:    v_writelane_b32 v4, s39, 7
-; VI-NEXT:    v_writelane_b32 v4, s48, 8
-; VI-NEXT:    v_writelane_b32 v4, s49, 9
-; VI-NEXT:    v_writelane_b32 v4, s50, 10
-; VI-NEXT:    v_writelane_b32 v4, s51, 11
-; VI-NEXT:    v_writelane_b32 v4, s52, 12
-; VI-NEXT:    v_writelane_b32 v4, s53, 13
-; VI-NEXT:    v_writelane_b32 v4, s54, 14
-; VI-NEXT:    v_writelane_b32 v4, s55, 15
-; VI-NEXT:    v_writelane_b32 v4, s64, 16
-; VI-NEXT:    v_writelane_b32 v4, s65, 17
+; VI-NEXT:    v_writelane_b32 v4, s34, 0
+; VI-NEXT:    v_writelane_b32 v4, s35, 1
+; VI-NEXT:    v_writelane_b32 v4, s36, 2
+; VI-NEXT:    v_writelane_b32 v4, s37, 3
+; VI-NEXT:    v_writelane_b32 v4, s38, 4
+; VI-NEXT:    v_writelane_b32 v4, s39, 5
+; VI-NEXT:    v_writelane_b32 v4, s48, 6
+; VI-NEXT:    v_writelane_b32 v4, s49, 7
+; VI-NEXT:    v_writelane_b32 v4, s50, 8
+; VI-NEXT:    v_writelane_b32 v4, s51, 9
+; VI-NEXT:    v_writelane_b32 v4, s52, 10
+; VI-NEXT:    v_writelane_b32 v4, s53, 11
+; VI-NEXT:    v_writelane_b32 v4, s54, 12
+; VI-NEXT:    v_writelane_b32 v4, s55, 13
+; VI-NEXT:    v_writelane_b32 v4, s64, 14
+; VI-NEXT:    v_writelane_b32 v4, s65, 15
+; VI-NEXT:    v_writelane_b32 v4, s66, 16
+; VI-NEXT:    v_writelane_b32 v4, s67, 17
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v4, s66, 18
+; VI-NEXT:    v_writelane_b32 v4, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
-; VI-NEXT:    v_writelane_b32 v4, s67, 19
+; VI-NEXT:    v_writelane_b32 v4, s31, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s56, s5, 24
@@ -10667,27 +10667,27 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT:    v_readlane_b32 s30, v4, 18
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s67, v4, 19
-; VI-NEXT:    v_readlane_b32 s66, v4, 18
-; VI-NEXT:    v_readlane_b32 s65, v4, 17
-; VI-NEXT:    v_readlane_b32 s64, v4, 16
-; VI-NEXT:    v_readlane_b32 s55, v4, 15
-; VI-NEXT:    v_readlane_b32 s54, v4, 14
-; VI-NEXT:    v_readlane_b32 s53, v4, 13
-; VI-NEXT:    v_readlane_b32 s52, v4, 12
-; VI-NEXT:    v_readlane_b32 s51, v4, 11
-; VI-NEXT:    v_readlane_b32 s50, v4, 10
-; VI-NEXT:    v_readlane_b32 s49, v4, 9
-; VI-NEXT:    v_readlane_b32 s48, v4, 8
-; VI-NEXT:    v_readlane_b32 s39, v4, 7
-; VI-NEXT:    v_readlane_b32 s38, v4, 6
-; VI-NEXT:    v_readlane_b32 s37, v4, 5
-; VI-NEXT:    v_readlane_b32 s36, v4, 4
-; VI-NEXT:    v_readlane_b32 s35, v4, 3
-; VI-NEXT:    v_readlane_b32 s34, v4, 2
-; VI-NEXT:    v_readlane_b32 s31, v4, 1
-; VI-NEXT:    v_readlane_b32 s30, v4, 0
+; VI-NEXT:    v_readlane_b32 s31, v4, 19
+; VI-NEXT:    v_readlane_b32 s67, v4, 17
+; VI-NEXT:    v_readlane_b32 s66, v4, 16
+; VI-NEXT:    v_readlane_b32 s65, v4, 15
+; VI-NEXT:    v_readlane_b32 s64, v4, 14
+; VI-NEXT:    v_readlane_b32 s55, v4, 13
+; VI-NEXT:    v_readlane_b32 s54, v4, 12
+; VI-NEXT:    v_readlane_b32 s53, v4, 11
+; VI-NEXT:    v_readlane_b32 s52, v4, 10
+; VI-NEXT:    v_readlane_b32 s51, v4, 9
+; VI-NEXT:    v_readlane_b32 s50, v4, 8
+; VI-NEXT:    v_readlane_b32 s49, v4, 7
+; VI-NEXT:    v_readlane_b32 s48, v4, 6
+; VI-NEXT:    v_readlane_b32 s39, v4, 5
+; VI-NEXT:    v_readlane_b32 s38, v4, 4
+; VI-NEXT:    v_readlane_b32 s37, v4, 3
+; VI-NEXT:    v_readlane_b32 s36, v4, 2
+; VI-NEXT:    v_readlane_b32 s35, v4, 1
+; VI-NEXT:    v_readlane_b32 s34, v4, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10750,26 +10750,26 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v4, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v4, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v4, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v4, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v4, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v4, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v4, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v4, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v4, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v4, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v4, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v4, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v4, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v4, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v4, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v4, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v4, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v4, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v4, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v4, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v4, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v4, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v4, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v4, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v4, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v4, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v4, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v4, s55, 13
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v4, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v4, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_writelane_b32 v4, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v4, s31, 15
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s56, s5, 24
@@ -10999,23 +10999,23 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX9-NEXT:    v_perm_b32 v2, s57, v3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_readlane_b32 s30, v4, 14
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    v_readlane_b32 s55, v4, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v4, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v4, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v4, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v4, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v4, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v4, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v4, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v4, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v4, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v4, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v4, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v4, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v4, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v4, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v4, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v4, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v4, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v4, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v4, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v4, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v4, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v4, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v4, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v4, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v4, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v4, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v4, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v4, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -11078,17 +11078,17 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX11-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v23, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v23, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v23, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v23, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v23, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v23, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v23, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v23, s48, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 8
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s27, 24
@@ -11210,7 +11210,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX11-NEXT:    v_mov_b32_e32 v12, 0xc0c0004
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v5, s34, s28, v12
-; GFX11-NEXT:    v_readlane_b32 s34, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX11-NEXT:    v_perm_b32 v2, s39, s40, v12
 ; GFX11-NEXT:    v_perm_b32 v4, s37, s36, v12
@@ -11274,19 +11274,19 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; GFX11-NEXT:    v_or_b32_e32 v14, v19, v17
 ; GFX11-NEXT:    v_or_b32_e32 v15, v21, v18
 ; GFX11-NEXT:    v_or_b32_e32 v16, v22, v20
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 7
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s48, v23, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v23, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v23, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v23, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v23, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v23, 3
-; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 1
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v23, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -11359,7 +11359,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
+define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13635,7 +13635,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v16i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15272,7 +15272,7 @@ end:
   ret <16 x i32> %phi
 }
 
-define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) {
+define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15398,7 +15398,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15453,8 +15453,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB29_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15514,8 +15514,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB29_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15575,8 +15575,8 @@ define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB29_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15643,7 +15643,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) {
+define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15781,7 +15781,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v8i64_to_v16f32_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v8i64_to_v16f32_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15982,7 +15982,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) {
+define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16108,7 +16108,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16163,8 +16163,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB33_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16224,8 +16224,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB33_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16285,8 +16285,8 @@ define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB33_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16353,7 +16353,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) {
+define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16455,7 +16455,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16502,8 +16502,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16555,8 +16555,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16608,8 +16608,8 @@ define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB35_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16668,7 +16668,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) {
+define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16893,7 +16893,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17109,8 +17109,8 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB37_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17170,8 +17170,8 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB37_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17238,7 +17238,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) {
+define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17574,7 +17574,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17941,8 +17941,8 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB39_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18009,7 +18009,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) {
+define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18234,7 +18234,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18450,8 +18450,8 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB41_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18511,8 +18511,8 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB41_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18579,7 +18579,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) {
+define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18979,7 +18979,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19349,8 +19349,8 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB43_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    v_readlane_b32 s30, v17, 0
+; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19411,8 +19411,8 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB43_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19479,7 +19479,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19784,7 +19784,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20064,8 +20064,8 @@ define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB45_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20125,8 +20125,8 @@ define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB45_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20193,7 +20193,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
+define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21599,7 +21599,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22179,8 +22179,8 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22530,8 +22530,8 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB47_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23245,7 +23245,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
+define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24696,7 +24696,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v16f32_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24704,40 +24704,40 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s30, 28
 ; SI-NEXT:    v_readfirstlane_b32 s37, v2
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s36, v1
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    v_writelane_b32 v40, s31, 29
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s82, s37, 24
@@ -25152,37 +25152,37 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT:    v_readlane_b32 s30, v40, 28
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 29
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25195,27 +25195,27 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
+; VI-NEXT:    v_writelane_b32 v63, s31, 19
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
@@ -25551,26 +25551,26 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v3, v3, v34, s4
 ; VI-NEXT:    v_perm_b32 v1, v1, v29, s4
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 18
+; VI-NEXT:    v_readlane_b32 s31, v63, 19
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v5, v33, v5, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -25624,23 +25624,23 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -25970,22 +25970,22 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v34, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v29, s4
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v5, v33, v5, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -26036,18 +26036,18 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 s42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB49_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s43, s27, 24
@@ -26315,21 +26315,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v17
 ; GFX11-NEXT:    v_or_b32_e32 v3, v19, v15
 ; GFX11-NEXT:    v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[80:83], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[11:14], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[7:10], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -26352,7 +26352,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
+define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v16f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28628,7 +28628,7 @@ end:
   ret <16 x float> %phi
 }
 
-define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v16f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30265,7 +30265,7 @@ end:
   ret <16 x float> %phi
 }
 
-define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) {
+define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30403,7 +30403,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v8i64_to_v8f64_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v8i64_to_v8f64_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30603,7 +30603,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) {
+define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30705,7 +30705,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30752,8 +30752,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v14, s30
 ; SI-NEXT:    v_mov_b32_e32 v15, s31
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30805,8 +30805,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30858,8 +30858,8 @@ define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB55_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30918,7 +30918,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
+define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31155,7 +31155,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31452,7 +31452,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
+define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31788,7 +31788,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32155,8 +32155,8 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB59_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32223,7 +32223,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
+define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32460,7 +32460,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32757,7 +32757,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) {
+define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33157,7 +33157,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33527,8 +33527,8 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB63_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    v_readlane_b32 s30, v17, 0
+; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -33589,8 +33589,8 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB63_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -33657,7 +33657,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33974,7 +33974,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34319,7 +34319,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
+define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35725,7 +35725,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36305,8 +36305,8 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB67_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -36656,8 +36656,8 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB67_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -37371,7 +37371,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
+define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38848,7 +38848,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i64_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38856,40 +38856,40 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v4, s30, 0
-; SI-NEXT:    v_writelane_b32 v4, s31, 1
-; SI-NEXT:    v_writelane_b32 v4, s34, 2
-; SI-NEXT:    v_writelane_b32 v4, s35, 3
-; SI-NEXT:    v_writelane_b32 v4, s36, 4
-; SI-NEXT:    v_writelane_b32 v4, s37, 5
-; SI-NEXT:    v_writelane_b32 v4, s38, 6
-; SI-NEXT:    v_writelane_b32 v4, s39, 7
-; SI-NEXT:    v_writelane_b32 v4, s48, 8
-; SI-NEXT:    v_writelane_b32 v4, s49, 9
-; SI-NEXT:    v_writelane_b32 v4, s50, 10
-; SI-NEXT:    v_writelane_b32 v4, s51, 11
-; SI-NEXT:    v_writelane_b32 v4, s52, 12
-; SI-NEXT:    v_writelane_b32 v4, s53, 13
-; SI-NEXT:    v_writelane_b32 v4, s54, 14
-; SI-NEXT:    v_writelane_b32 v4, s55, 15
-; SI-NEXT:    v_writelane_b32 v4, s64, 16
-; SI-NEXT:    v_writelane_b32 v4, s65, 17
-; SI-NEXT:    v_writelane_b32 v4, s66, 18
-; SI-NEXT:    v_writelane_b32 v4, s67, 19
-; SI-NEXT:    v_writelane_b32 v4, s68, 20
-; SI-NEXT:    v_writelane_b32 v4, s69, 21
-; SI-NEXT:    v_writelane_b32 v4, s70, 22
-; SI-NEXT:    v_writelane_b32 v4, s71, 23
-; SI-NEXT:    v_writelane_b32 v4, s80, 24
-; SI-NEXT:    v_writelane_b32 v4, s81, 25
-; SI-NEXT:    v_writelane_b32 v4, s82, 26
-; SI-NEXT:    v_writelane_b32 v4, s83, 27
+; SI-NEXT:    v_writelane_b32 v4, s34, 0
+; SI-NEXT:    v_writelane_b32 v4, s35, 1
+; SI-NEXT:    v_writelane_b32 v4, s36, 2
+; SI-NEXT:    v_writelane_b32 v4, s37, 3
+; SI-NEXT:    v_writelane_b32 v4, s38, 4
+; SI-NEXT:    v_writelane_b32 v4, s39, 5
+; SI-NEXT:    v_writelane_b32 v4, s48, 6
+; SI-NEXT:    v_writelane_b32 v4, s49, 7
+; SI-NEXT:    v_writelane_b32 v4, s50, 8
+; SI-NEXT:    v_writelane_b32 v4, s51, 9
+; SI-NEXT:    v_writelane_b32 v4, s52, 10
+; SI-NEXT:    v_writelane_b32 v4, s53, 11
+; SI-NEXT:    v_writelane_b32 v4, s54, 12
+; SI-NEXT:    v_writelane_b32 v4, s55, 13
+; SI-NEXT:    v_writelane_b32 v4, s64, 14
+; SI-NEXT:    v_writelane_b32 v4, s65, 15
+; SI-NEXT:    v_writelane_b32 v4, s66, 16
+; SI-NEXT:    v_writelane_b32 v4, s67, 17
+; SI-NEXT:    v_writelane_b32 v4, s68, 18
+; SI-NEXT:    v_writelane_b32 v4, s69, 19
+; SI-NEXT:    v_writelane_b32 v4, s70, 20
+; SI-NEXT:    v_writelane_b32 v4, s71, 21
+; SI-NEXT:    v_writelane_b32 v4, s80, 22
+; SI-NEXT:    v_writelane_b32 v4, s81, 23
+; SI-NEXT:    v_writelane_b32 v4, s82, 24
+; SI-NEXT:    v_writelane_b32 v4, s83, 25
+; SI-NEXT:    v_writelane_b32 v4, s84, 26
+; SI-NEXT:    v_writelane_b32 v4, s85, 27
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
-; SI-NEXT:    v_writelane_b32 v4, s84, 28
+; SI-NEXT:    v_writelane_b32 v4, s30, 28
 ; SI-NEXT:    v_readfirstlane_b32 s5, v2
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v1
-; SI-NEXT:    v_writelane_b32 v4, s85, 29
+; SI-NEXT:    v_writelane_b32 v4, s31, 29
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s38, s5, 24
@@ -39210,37 +39210,37 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v4, 28
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s85, v4, 29
-; SI-NEXT:    v_readlane_b32 s84, v4, 28
-; SI-NEXT:    v_readlane_b32 s83, v4, 27
-; SI-NEXT:    v_readlane_b32 s82, v4, 26
-; SI-NEXT:    v_readlane_b32 s81, v4, 25
-; SI-NEXT:    v_readlane_b32 s80, v4, 24
-; SI-NEXT:    v_readlane_b32 s71, v4, 23
-; SI-NEXT:    v_readlane_b32 s70, v4, 22
-; SI-NEXT:    v_readlane_b32 s69, v4, 21
-; SI-NEXT:    v_readlane_b32 s68, v4, 20
-; SI-NEXT:    v_readlane_b32 s67, v4, 19
-; SI-NEXT:    v_readlane_b32 s66, v4, 18
-; SI-NEXT:    v_readlane_b32 s65, v4, 17
-; SI-NEXT:    v_readlane_b32 s64, v4, 16
-; SI-NEXT:    v_readlane_b32 s55, v4, 15
-; SI-NEXT:    v_readlane_b32 s54, v4, 14
-; SI-NEXT:    v_readlane_b32 s53, v4, 13
-; SI-NEXT:    v_readlane_b32 s52, v4, 12
-; SI-NEXT:    v_readlane_b32 s51, v4, 11
-; SI-NEXT:    v_readlane_b32 s50, v4, 10
-; SI-NEXT:    v_readlane_b32 s49, v4, 9
-; SI-NEXT:    v_readlane_b32 s48, v4, 8
-; SI-NEXT:    v_readlane_b32 s39, v4, 7
-; SI-NEXT:    v_readlane_b32 s38, v4, 6
-; SI-NEXT:    v_readlane_b32 s37, v4, 5
-; SI-NEXT:    v_readlane_b32 s36, v4, 4
-; SI-NEXT:    v_readlane_b32 s35, v4, 3
-; SI-NEXT:    v_readlane_b32 s34, v4, 2
-; SI-NEXT:    v_readlane_b32 s31, v4, 1
-; SI-NEXT:    v_readlane_b32 s30, v4, 0
+; SI-NEXT:    v_readlane_b32 s31, v4, 29
+; SI-NEXT:    v_readlane_b32 s85, v4, 27
+; SI-NEXT:    v_readlane_b32 s84, v4, 26
+; SI-NEXT:    v_readlane_b32 s83, v4, 25
+; SI-NEXT:    v_readlane_b32 s82, v4, 24
+; SI-NEXT:    v_readlane_b32 s81, v4, 23
+; SI-NEXT:    v_readlane_b32 s80, v4, 22
+; SI-NEXT:    v_readlane_b32 s71, v4, 21
+; SI-NEXT:    v_readlane_b32 s70, v4, 20
+; SI-NEXT:    v_readlane_b32 s69, v4, 19
+; SI-NEXT:    v_readlane_b32 s68, v4, 18
+; SI-NEXT:    v_readlane_b32 s67, v4, 17
+; SI-NEXT:    v_readlane_b32 s66, v4, 16
+; SI-NEXT:    v_readlane_b32 s65, v4, 15
+; SI-NEXT:    v_readlane_b32 s64, v4, 14
+; SI-NEXT:    v_readlane_b32 s55, v4, 13
+; SI-NEXT:    v_readlane_b32 s54, v4, 12
+; SI-NEXT:    v_readlane_b32 s53, v4, 11
+; SI-NEXT:    v_readlane_b32 s52, v4, 10
+; SI-NEXT:    v_readlane_b32 s51, v4, 9
+; SI-NEXT:    v_readlane_b32 s50, v4, 8
+; SI-NEXT:    v_readlane_b32 s49, v4, 7
+; SI-NEXT:    v_readlane_b32 s48, v4, 6
+; SI-NEXT:    v_readlane_b32 s39, v4, 5
+; SI-NEXT:    v_readlane_b32 s38, v4, 4
+; SI-NEXT:    v_readlane_b32 s37, v4, 3
+; SI-NEXT:    v_readlane_b32 s36, v4, 2
+; SI-NEXT:    v_readlane_b32 s35, v4, 1
+; SI-NEXT:    v_readlane_b32 s34, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39303,30 +39303,30 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v4, s30, 0
-; VI-NEXT:    v_writelane_b32 v4, s31, 1
-; VI-NEXT:    v_writelane_b32 v4, s34, 2
-; VI-NEXT:    v_writelane_b32 v4, s35, 3
-; VI-NEXT:    v_writelane_b32 v4, s36, 4
-; VI-NEXT:    v_writelane_b32 v4, s37, 5
-; VI-NEXT:    v_writelane_b32 v4, s38, 6
-; VI-NEXT:    v_writelane_b32 v4, s39, 7
-; VI-NEXT:    v_writelane_b32 v4, s48, 8
-; VI-NEXT:    v_writelane_b32 v4, s49, 9
-; VI-NEXT:    v_writelane_b32 v4, s50, 10
-; VI-NEXT:    v_writelane_b32 v4, s51, 11
-; VI-NEXT:    v_writelane_b32 v4, s52, 12
-; VI-NEXT:    v_writelane_b32 v4, s53, 13
-; VI-NEXT:    v_writelane_b32 v4, s54, 14
-; VI-NEXT:    v_writelane_b32 v4, s55, 15
-; VI-NEXT:    v_writelane_b32 v4, s64, 16
-; VI-NEXT:    v_writelane_b32 v4, s65, 17
+; VI-NEXT:    v_writelane_b32 v4, s34, 0
+; VI-NEXT:    v_writelane_b32 v4, s35, 1
+; VI-NEXT:    v_writelane_b32 v4, s36, 2
+; VI-NEXT:    v_writelane_b32 v4, s37, 3
+; VI-NEXT:    v_writelane_b32 v4, s38, 4
+; VI-NEXT:    v_writelane_b32 v4, s39, 5
+; VI-NEXT:    v_writelane_b32 v4, s48, 6
+; VI-NEXT:    v_writelane_b32 v4, s49, 7
+; VI-NEXT:    v_writelane_b32 v4, s50, 8
+; VI-NEXT:    v_writelane_b32 v4, s51, 9
+; VI-NEXT:    v_writelane_b32 v4, s52, 10
+; VI-NEXT:    v_writelane_b32 v4, s53, 11
+; VI-NEXT:    v_writelane_b32 v4, s54, 12
+; VI-NEXT:    v_writelane_b32 v4, s55, 13
+; VI-NEXT:    v_writelane_b32 v4, s64, 14
+; VI-NEXT:    v_writelane_b32 v4, s65, 15
+; VI-NEXT:    v_writelane_b32 v4, s66, 16
+; VI-NEXT:    v_writelane_b32 v4, s67, 17
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v4, s66, 18
+; VI-NEXT:    v_writelane_b32 v4, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
-; VI-NEXT:    v_writelane_b32 v4, s67, 19
+; VI-NEXT:    v_writelane_b32 v4, s31, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s56, s5, 24
@@ -39571,27 +39571,27 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT:    v_readlane_b32 s30, v4, 18
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s67, v4, 19
-; VI-NEXT:    v_readlane_b32 s66, v4, 18
-; VI-NEXT:    v_readlane_b32 s65, v4, 17
-; VI-NEXT:    v_readlane_b32 s64, v4, 16
-; VI-NEXT:    v_readlane_b32 s55, v4, 15
-; VI-NEXT:    v_readlane_b32 s54, v4, 14
-; VI-NEXT:    v_readlane_b32 s53, v4, 13
-; VI-NEXT:    v_readlane_b32 s52, v4, 12
-; VI-NEXT:    v_readlane_b32 s51, v4, 11
-; VI-NEXT:    v_readlane_b32 s50, v4, 10
-; VI-NEXT:    v_readlane_b32 s49, v4, 9
-; VI-NEXT:    v_readlane_b32 s48, v4, 8
-; VI-NEXT:    v_readlane_b32 s39, v4, 7
-; VI-NEXT:    v_readlane_b32 s38, v4, 6
-; VI-NEXT:    v_readlane_b32 s37, v4, 5
-; VI-NEXT:    v_readlane_b32 s36, v4, 4
-; VI-NEXT:    v_readlane_b32 s35, v4, 3
-; VI-NEXT:    v_readlane_b32 s34, v4, 2
-; VI-NEXT:    v_readlane_b32 s31, v4, 1
-; VI-NEXT:    v_readlane_b32 s30, v4, 0
+; VI-NEXT:    v_readlane_b32 s31, v4, 19
+; VI-NEXT:    v_readlane_b32 s67, v4, 17
+; VI-NEXT:    v_readlane_b32 s66, v4, 16
+; VI-NEXT:    v_readlane_b32 s65, v4, 15
+; VI-NEXT:    v_readlane_b32 s64, v4, 14
+; VI-NEXT:    v_readlane_b32 s55, v4, 13
+; VI-NEXT:    v_readlane_b32 s54, v4, 12
+; VI-NEXT:    v_readlane_b32 s53, v4, 11
+; VI-NEXT:    v_readlane_b32 s52, v4, 10
+; VI-NEXT:    v_readlane_b32 s51, v4, 9
+; VI-NEXT:    v_readlane_b32 s50, v4, 8
+; VI-NEXT:    v_readlane_b32 s49, v4, 7
+; VI-NEXT:    v_readlane_b32 s48, v4, 6
+; VI-NEXT:    v_readlane_b32 s39, v4, 5
+; VI-NEXT:    v_readlane_b32 s38, v4, 4
+; VI-NEXT:    v_readlane_b32 s37, v4, 3
+; VI-NEXT:    v_readlane_b32 s36, v4, 2
+; VI-NEXT:    v_readlane_b32 s35, v4, 1
+; VI-NEXT:    v_readlane_b32 s34, v4, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39654,26 +39654,26 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v4, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v4, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v4, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v4, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v4, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v4, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v4, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v4, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v4, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v4, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v4, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v4, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v4, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v4, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v4, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v4, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v4, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v4, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v4, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v4, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v4, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v4, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v4, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v4, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v4, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v4, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v4, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v4, s55, 13
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v4, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v4, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_writelane_b32 v4, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v4, s31, 15
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s56, s5, 24
@@ -39903,23 +39903,23 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX9-NEXT:    v_perm_b32 v2, s57, v3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_readlane_b32 s30, v4, 14
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    v_readlane_b32 s55, v4, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v4, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v4, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v4, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v4, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v4, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v4, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v4, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v4, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v4, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v4, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v4, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v4, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v4, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v4, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v4, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v4, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v4, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v4, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v4, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v4, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v4, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v4, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v4, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v4, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v4, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v4, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v4, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v4, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39982,17 +39982,17 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX11-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v23, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v23, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v23, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v23, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v23, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v23, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v23, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v23, s48, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 8
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s42, s27, 24
@@ -40114,7 +40114,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX11-NEXT:    v_mov_b32_e32 v12, 0xc0c0004
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v5, s34, s28, v12
-; GFX11-NEXT:    v_readlane_b32 s34, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX11-NEXT:    v_perm_b32 v2, s39, s40, v12
 ; GFX11-NEXT:    v_perm_b32 v4, s37, s36, v12
@@ -40178,19 +40178,19 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; GFX11-NEXT:    v_or_b32_e32 v14, v19, v17
 ; GFX11-NEXT:    v_or_b32_e32 v15, v21, v18
 ; GFX11-NEXT:    v_or_b32_e32 v16, v22, v20
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 7
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s48, v23, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v23, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v23, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v23, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v23, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v23, 3
-; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 1
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v23, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -40263,7 +40263,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
+define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42539,7 +42539,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v8i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44176,7 +44176,7 @@ end:
   ret <8 x i64> %phi
 }
 
-define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) {
+define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44377,7 +44377,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44577,8 +44577,8 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB73_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -44630,8 +44630,8 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB73_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -44690,7 +44690,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
+define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45026,7 +45026,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45393,8 +45393,8 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB75_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45461,7 +45461,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) {
+define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45662,7 +45662,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45862,8 +45862,8 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB77_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45915,8 +45915,8 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB77_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45975,7 +45975,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) {
+define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46375,7 +46375,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46745,8 +46745,8 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB79_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    v_readlane_b32 s30, v17, 0
+; VI-NEXT:    v_readlane_b32 s31, v17, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -46807,8 +46807,8 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB79_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -46875,7 +46875,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47148,7 +47148,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47412,8 +47412,8 @@ define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB81_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    v_readlane_b32 s30, v16, 0
+; VI-NEXT:    v_readlane_b32 s31, v16, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -47465,8 +47465,8 @@ define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB81_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -47525,7 +47525,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
+define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -48931,7 +48931,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49511,8 +49511,8 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB83_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -49862,8 +49862,8 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB83_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -50577,7 +50577,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
+define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52004,7 +52004,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8f64_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52012,42 +52012,42 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
+; SI-NEXT:    v_writelane_b32 v40, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s5, v2
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v1
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
+; SI-NEXT:    v_writelane_b32 v40, s31, 31
 ; SI-NEXT:    s_cbranch_scc0 .LBB85_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s5, 24
@@ -52446,39 +52446,39 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v40, 30
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 31
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -52491,27 +52491,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
+; VI-NEXT:    v_writelane_b32 v63, s31, 19
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
@@ -52832,26 +52832,26 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v3, v3, v36, s4
 ; VI-NEXT:    v_perm_b32 v1, v1, v31, s4
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 18
+; VI-NEXT:    v_readlane_b32 s31, v63, 19
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v5, v35, v5, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -52920,23 +52920,23 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -53254,22 +53254,22 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v36, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v31, s4
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v5, v35, v5, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -53333,18 +53333,18 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 s42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB85_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s90, s27, 24
@@ -53604,21 +53604,21 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v8
 ; GFX11-NEXT:    v_or_b32_e32 v3, v10, v9
 ; GFX11-NEXT:    v_or_b32_e32 v4, v12, v11
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[84:87], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[15:18], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -53641,7 +53641,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
+define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -55917,7 +55917,7 @@ end:
   ret <8 x double> %phi
 }
 
-define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v8f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -57554,7 +57554,7 @@ end:
   ret <8 x double> %phi
 }
 
-define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) {
+define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58002,7 +58002,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58010,16 +58010,16 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v16, s30, 0
-; SI-NEXT:    v_writelane_b32 v16, s31, 1
-; SI-NEXT:    v_writelane_b32 v16, s34, 2
-; SI-NEXT:    v_writelane_b32 v16, s35, 3
-; SI-NEXT:    v_writelane_b32 v16, s36, 4
-; SI-NEXT:    v_writelane_b32 v16, s37, 5
-; SI-NEXT:    v_writelane_b32 v16, s38, 6
-; SI-NEXT:    v_writelane_b32 v16, s39, 7
+; SI-NEXT:    v_writelane_b32 v16, s34, 0
+; SI-NEXT:    v_writelane_b32 v16, s35, 1
+; SI-NEXT:    v_writelane_b32 v16, s36, 2
+; SI-NEXT:    v_writelane_b32 v16, s37, 3
+; SI-NEXT:    v_writelane_b32 v16, s38, 4
+; SI-NEXT:    v_writelane_b32 v16, s39, 5
+; SI-NEXT:    v_writelane_b32 v16, s30, 6
 ; SI-NEXT:    v_readfirstlane_b32 s37, v1
 ; SI-NEXT:    v_readfirstlane_b32 s39, v0
+; SI-NEXT:    v_writelane_b32 v16, s31, 7
 ; SI-NEXT:    s_lshr_b32 s92, s29, 16
 ; SI-NEXT:    s_lshr_b32 s36, s28, 16
 ; SI-NEXT:    s_lshr_b32 s91, s27, 16
@@ -58251,6 +58251,7 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s19, s41, 0xffff
 ; SI-NEXT:    s_lshl_b32 s20, s93, 16
 ; SI-NEXT:    s_or_b32 s19, s19, s20
+; SI-NEXT:    v_readlane_b32 s30, v16, 6
 ; SI-NEXT:    v_mov_b32_e32 v0, s14
 ; SI-NEXT:    v_mov_b32_e32 v1, s15
 ; SI-NEXT:    v_mov_b32_e32 v2, s12
@@ -58267,14 +58268,13 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v13, s17
 ; SI-NEXT:    v_mov_b32_e32 v14, s18
 ; SI-NEXT:    v_mov_b32_e32 v15, s19
-; SI-NEXT:    v_readlane_b32 s39, v16, 7
-; SI-NEXT:    v_readlane_b32 s38, v16, 6
-; SI-NEXT:    v_readlane_b32 s37, v16, 5
-; SI-NEXT:    v_readlane_b32 s36, v16, 4
-; SI-NEXT:    v_readlane_b32 s35, v16, 3
-; SI-NEXT:    v_readlane_b32 s34, v16, 2
-; SI-NEXT:    v_readlane_b32 s31, v16, 1
-; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    v_readlane_b32 s31, v16, 7
+; SI-NEXT:    v_readlane_b32 s39, v16, 5
+; SI-NEXT:    v_readlane_b32 s38, v16, 4
+; SI-NEXT:    v_readlane_b32 s37, v16, 3
+; SI-NEXT:    v_readlane_b32 s36, v16, 2
+; SI-NEXT:    v_readlane_b32 s35, v16, 1
+; SI-NEXT:    v_readlane_b32 s34, v16, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -58464,8 +58464,8 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB89_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -58532,7 +58532,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) {
+define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58884,7 +58884,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59253,8 +59253,8 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB91_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v18, 1
 ; VI-NEXT:    v_readlane_b32 s30, v18, 0
+; VI-NEXT:    v_readlane_b32 s31, v18, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -59315,8 +59315,8 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB91_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -59383,7 +59383,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59778,7 +59778,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59786,16 +59786,16 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
 ; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v17, s30, 0
-; SI-NEXT:    v_writelane_b32 v17, s31, 1
-; SI-NEXT:    v_writelane_b32 v17, s34, 2
-; SI-NEXT:    v_writelane_b32 v17, s35, 3
-; SI-NEXT:    v_writelane_b32 v17, s36, 4
-; SI-NEXT:    v_writelane_b32 v17, s37, 5
-; SI-NEXT:    v_writelane_b32 v17, s38, 6
-; SI-NEXT:    v_writelane_b32 v17, s39, 7
+; SI-NEXT:    v_writelane_b32 v17, s34, 0
+; SI-NEXT:    v_writelane_b32 v17, s35, 1
+; SI-NEXT:    v_writelane_b32 v17, s36, 2
+; SI-NEXT:    v_writelane_b32 v17, s37, 3
+; SI-NEXT:    v_writelane_b32 v17, s38, 4
+; SI-NEXT:    v_writelane_b32 v17, s39, 5
+; SI-NEXT:    v_writelane_b32 v17, s30, 6
 ; SI-NEXT:    v_readfirstlane_b32 s39, v1
 ; SI-NEXT:    v_readfirstlane_b32 s37, v0
+; SI-NEXT:    v_writelane_b32 v17, s31, 7
 ; SI-NEXT:    s_lshr_b32 s35, s29, 16
 ; SI-NEXT:    s_lshr_b32 s34, s28, 16
 ; SI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -60026,15 +60026,15 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s9
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s6
+; SI-NEXT:    v_readlane_b32 s30, v17, 6
 ; SI-NEXT:    v_lshr_b64 v[15:16], v[15:16], 16
-; SI-NEXT:    v_readlane_b32 s39, v17, 7
-; SI-NEXT:    v_readlane_b32 s38, v17, 6
-; SI-NEXT:    v_readlane_b32 s37, v17, 5
-; SI-NEXT:    v_readlane_b32 s36, v17, 4
-; SI-NEXT:    v_readlane_b32 s35, v17, 3
-; SI-NEXT:    v_readlane_b32 s34, v17, 2
-; SI-NEXT:    v_readlane_b32 s31, v17, 1
-; SI-NEXT:    v_readlane_b32 s30, v17, 0
+; SI-NEXT:    v_readlane_b32 s31, v17, 7
+; SI-NEXT:    v_readlane_b32 s39, v17, 5
+; SI-NEXT:    v_readlane_b32 s38, v17, 4
+; SI-NEXT:    v_readlane_b32 s37, v17, 3
+; SI-NEXT:    v_readlane_b32 s36, v17, 2
+; SI-NEXT:    v_readlane_b32 s35, v17, 1
+; SI-NEXT:    v_readlane_b32 s34, v17, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -60240,8 +60240,8 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB93_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -60308,7 +60308,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
+define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -61782,7 +61782,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -62454,8 +62454,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB95_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -62789,8 +62789,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB95_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -63428,7 +63428,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
+define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -65392,7 +65392,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32i16_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -65401,43 +65401,42 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v4, s30, 0
-; SI-NEXT:    v_writelane_b32 v4, s31, 1
-; SI-NEXT:    v_writelane_b32 v4, s34, 2
-; SI-NEXT:    v_writelane_b32 v4, s35, 3
-; SI-NEXT:    v_writelane_b32 v4, s36, 4
-; SI-NEXT:    v_writelane_b32 v4, s37, 5
-; SI-NEXT:    v_writelane_b32 v4, s38, 6
-; SI-NEXT:    v_writelane_b32 v4, s39, 7
-; SI-NEXT:    v_writelane_b32 v4, s48, 8
-; SI-NEXT:    v_writelane_b32 v4, s49, 9
-; SI-NEXT:    v_writelane_b32 v4, s50, 10
-; SI-NEXT:    v_writelane_b32 v4, s51, 11
-; SI-NEXT:    v_writelane_b32 v4, s52, 12
-; SI-NEXT:    v_writelane_b32 v4, s53, 13
-; SI-NEXT:    v_writelane_b32 v4, s54, 14
-; SI-NEXT:    v_writelane_b32 v4, s55, 15
-; SI-NEXT:    v_writelane_b32 v4, s64, 16
-; SI-NEXT:    v_writelane_b32 v4, s65, 17
-; SI-NEXT:    v_writelane_b32 v4, s66, 18
-; SI-NEXT:    v_writelane_b32 v4, s67, 19
-; SI-NEXT:    v_writelane_b32 v4, s68, 20
-; SI-NEXT:    v_writelane_b32 v4, s69, 21
-; SI-NEXT:    v_writelane_b32 v4, s70, 22
-; SI-NEXT:    v_writelane_b32 v4, s71, 23
-; SI-NEXT:    v_writelane_b32 v4, s80, 24
-; SI-NEXT:    v_writelane_b32 v4, s81, 25
-; SI-NEXT:    v_writelane_b32 v4, s82, 26
-; SI-NEXT:    v_writelane_b32 v4, s83, 27
-; SI-NEXT:    v_writelane_b32 v4, s84, 28
-; SI-NEXT:    v_writelane_b32 v4, s85, 29
-; SI-NEXT:    v_writelane_b32 v4, s86, 30
-; SI-NEXT:    v_writelane_b32 v4, s87, 31
-; SI-NEXT:    v_writelane_b32 v4, s96, 32
-; SI-NEXT:    v_writelane_b32 v4, s97, 33
+; SI-NEXT:    v_writelane_b32 v4, s34, 0
+; SI-NEXT:    v_writelane_b32 v4, s35, 1
+; SI-NEXT:    v_writelane_b32 v4, s36, 2
+; SI-NEXT:    v_writelane_b32 v4, s37, 3
+; SI-NEXT:    v_writelane_b32 v4, s38, 4
+; SI-NEXT:    v_writelane_b32 v4, s39, 5
+; SI-NEXT:    v_writelane_b32 v4, s48, 6
+; SI-NEXT:    v_writelane_b32 v4, s49, 7
+; SI-NEXT:    v_writelane_b32 v4, s50, 8
+; SI-NEXT:    v_writelane_b32 v4, s51, 9
+; SI-NEXT:    v_writelane_b32 v4, s52, 10
+; SI-NEXT:    v_writelane_b32 v4, s53, 11
+; SI-NEXT:    v_writelane_b32 v4, s54, 12
+; SI-NEXT:    v_writelane_b32 v4, s55, 13
+; SI-NEXT:    v_writelane_b32 v4, s64, 14
+; SI-NEXT:    v_writelane_b32 v4, s65, 15
+; SI-NEXT:    v_writelane_b32 v4, s66, 16
+; SI-NEXT:    v_writelane_b32 v4, s67, 17
+; SI-NEXT:    v_writelane_b32 v4, s68, 18
+; SI-NEXT:    v_writelane_b32 v4, s69, 19
+; SI-NEXT:    v_writelane_b32 v4, s70, 20
+; SI-NEXT:    v_writelane_b32 v4, s71, 21
+; SI-NEXT:    v_writelane_b32 v4, s80, 22
+; SI-NEXT:    v_writelane_b32 v4, s81, 23
+; SI-NEXT:    v_writelane_b32 v4, s82, 24
+; SI-NEXT:    v_writelane_b32 v4, s83, 25
+; SI-NEXT:    v_writelane_b32 v4, s84, 26
+; SI-NEXT:    v_writelane_b32 v4, s85, 27
+; SI-NEXT:    v_writelane_b32 v4, s86, 28
+; SI-NEXT:    v_writelane_b32 v4, s87, 29
+; SI-NEXT:    v_writelane_b32 v4, s96, 30
+; SI-NEXT:    v_writelane_b32 v4, s97, 31
+; SI-NEXT:    v_writelane_b32 v4, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s56, v2
 ; SI-NEXT:    v_readfirstlane_b32 s60, v1
-; SI-NEXT:    v_writelane_b32 v4, s98, 34
+; SI-NEXT:    v_writelane_b32 v4, s99, 33
 ; SI-NEXT:    s_lshr_b32 s68, s29, 16
 ; SI-NEXT:    s_lshr_b32 s46, s28, 16
 ; SI-NEXT:    s_lshr_b32 s70, s27, 16
@@ -65455,8 +65454,9 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; SI-NEXT:    s_lshr_b32 s69, s56, 16
 ; SI-NEXT:    s_lshr_b32 s58, s60, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    v_writelane_b32 v4, s30, 34
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v4, s99, 35
+; SI-NEXT:    v_writelane_b32 v4, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -65913,44 +65913,44 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s30, v4, 34
 ; SI-NEXT:    v_readlane_b32 s19, v5, 9
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v4, 35
-; SI-NEXT:    v_readlane_b32 s98, v4, 34
-; SI-NEXT:    v_readlane_b32 s97, v4, 33
-; SI-NEXT:    v_readlane_b32 s96, v4, 32
-; SI-NEXT:    v_readlane_b32 s87, v4, 31
-; SI-NEXT:    v_readlane_b32 s86, v4, 30
-; SI-NEXT:    v_readlane_b32 s85, v4, 29
-; SI-NEXT:    v_readlane_b32 s84, v4, 28
-; SI-NEXT:    v_readlane_b32 s83, v4, 27
-; SI-NEXT:    v_readlane_b32 s82, v4, 26
-; SI-NEXT:    v_readlane_b32 s81, v4, 25
-; SI-NEXT:    v_readlane_b32 s80, v4, 24
-; SI-NEXT:    v_readlane_b32 s71, v4, 23
-; SI-NEXT:    v_readlane_b32 s70, v4, 22
-; SI-NEXT:    v_readlane_b32 s69, v4, 21
-; SI-NEXT:    v_readlane_b32 s68, v4, 20
-; SI-NEXT:    v_readlane_b32 s67, v4, 19
-; SI-NEXT:    v_readlane_b32 s66, v4, 18
-; SI-NEXT:    v_readlane_b32 s65, v4, 17
-; SI-NEXT:    v_readlane_b32 s64, v4, 16
-; SI-NEXT:    v_readlane_b32 s55, v4, 15
-; SI-NEXT:    v_readlane_b32 s54, v4, 14
-; SI-NEXT:    v_readlane_b32 s53, v4, 13
-; SI-NEXT:    v_readlane_b32 s52, v4, 12
-; SI-NEXT:    v_readlane_b32 s51, v4, 11
-; SI-NEXT:    v_readlane_b32 s50, v4, 10
-; SI-NEXT:    v_readlane_b32 s49, v4, 9
-; SI-NEXT:    v_readlane_b32 s48, v4, 8
-; SI-NEXT:    v_readlane_b32 s39, v4, 7
-; SI-NEXT:    v_readlane_b32 s38, v4, 6
-; SI-NEXT:    v_readlane_b32 s37, v4, 5
-; SI-NEXT:    v_readlane_b32 s36, v4, 4
-; SI-NEXT:    v_readlane_b32 s35, v4, 3
-; SI-NEXT:    v_readlane_b32 s34, v4, 2
-; SI-NEXT:    v_readlane_b32 s31, v4, 1
-; SI-NEXT:    v_readlane_b32 s30, v4, 0
+; SI-NEXT:    v_readlane_b32 s31, v4, 35
+; SI-NEXT:    v_readlane_b32 s99, v4, 33
+; SI-NEXT:    v_readlane_b32 s98, v4, 32
+; SI-NEXT:    v_readlane_b32 s97, v4, 31
+; SI-NEXT:    v_readlane_b32 s96, v4, 30
+; SI-NEXT:    v_readlane_b32 s87, v4, 29
+; SI-NEXT:    v_readlane_b32 s86, v4, 28
+; SI-NEXT:    v_readlane_b32 s85, v4, 27
+; SI-NEXT:    v_readlane_b32 s84, v4, 26
+; SI-NEXT:    v_readlane_b32 s83, v4, 25
+; SI-NEXT:    v_readlane_b32 s82, v4, 24
+; SI-NEXT:    v_readlane_b32 s81, v4, 23
+; SI-NEXT:    v_readlane_b32 s80, v4, 22
+; SI-NEXT:    v_readlane_b32 s71, v4, 21
+; SI-NEXT:    v_readlane_b32 s70, v4, 20
+; SI-NEXT:    v_readlane_b32 s69, v4, 19
+; SI-NEXT:    v_readlane_b32 s68, v4, 18
+; SI-NEXT:    v_readlane_b32 s67, v4, 17
+; SI-NEXT:    v_readlane_b32 s66, v4, 16
+; SI-NEXT:    v_readlane_b32 s65, v4, 15
+; SI-NEXT:    v_readlane_b32 s64, v4, 14
+; SI-NEXT:    v_readlane_b32 s55, v4, 13
+; SI-NEXT:    v_readlane_b32 s54, v4, 12
+; SI-NEXT:    v_readlane_b32 s53, v4, 11
+; SI-NEXT:    v_readlane_b32 s52, v4, 10
+; SI-NEXT:    v_readlane_b32 s51, v4, 9
+; SI-NEXT:    v_readlane_b32 s50, v4, 8
+; SI-NEXT:    v_readlane_b32 s49, v4, 7
+; SI-NEXT:    v_readlane_b32 s48, v4, 6
+; SI-NEXT:    v_readlane_b32 s39, v4, 5
+; SI-NEXT:    v_readlane_b32 s38, v4, 4
+; SI-NEXT:    v_readlane_b32 s37, v4, 3
+; SI-NEXT:    v_readlane_b32 s36, v4, 2
+; SI-NEXT:    v_readlane_b32 s35, v4, 1
+; SI-NEXT:    v_readlane_b32 s34, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -66025,30 +66025,30 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v4, s30, 0
-; VI-NEXT:    v_writelane_b32 v4, s31, 1
-; VI-NEXT:    v_writelane_b32 v4, s34, 2
-; VI-NEXT:    v_writelane_b32 v4, s35, 3
-; VI-NEXT:    v_writelane_b32 v4, s36, 4
-; VI-NEXT:    v_writelane_b32 v4, s37, 5
-; VI-NEXT:    v_writelane_b32 v4, s38, 6
-; VI-NEXT:    v_writelane_b32 v4, s39, 7
-; VI-NEXT:    v_writelane_b32 v4, s48, 8
-; VI-NEXT:    v_writelane_b32 v4, s49, 9
-; VI-NEXT:    v_writelane_b32 v4, s50, 10
-; VI-NEXT:    v_writelane_b32 v4, s51, 11
-; VI-NEXT:    v_writelane_b32 v4, s52, 12
-; VI-NEXT:    v_writelane_b32 v4, s53, 13
-; VI-NEXT:    v_writelane_b32 v4, s54, 14
-; VI-NEXT:    v_writelane_b32 v4, s55, 15
-; VI-NEXT:    v_writelane_b32 v4, s64, 16
-; VI-NEXT:    v_writelane_b32 v4, s65, 17
+; VI-NEXT:    v_writelane_b32 v4, s34, 0
+; VI-NEXT:    v_writelane_b32 v4, s35, 1
+; VI-NEXT:    v_writelane_b32 v4, s36, 2
+; VI-NEXT:    v_writelane_b32 v4, s37, 3
+; VI-NEXT:    v_writelane_b32 v4, s38, 4
+; VI-NEXT:    v_writelane_b32 v4, s39, 5
+; VI-NEXT:    v_writelane_b32 v4, s48, 6
+; VI-NEXT:    v_writelane_b32 v4, s49, 7
+; VI-NEXT:    v_writelane_b32 v4, s50, 8
+; VI-NEXT:    v_writelane_b32 v4, s51, 9
+; VI-NEXT:    v_writelane_b32 v4, s52, 10
+; VI-NEXT:    v_writelane_b32 v4, s53, 11
+; VI-NEXT:    v_writelane_b32 v4, s54, 12
+; VI-NEXT:    v_writelane_b32 v4, s55, 13
+; VI-NEXT:    v_writelane_b32 v4, s64, 14
+; VI-NEXT:    v_writelane_b32 v4, s65, 15
+; VI-NEXT:    v_writelane_b32 v4, s66, 16
+; VI-NEXT:    v_writelane_b32 v4, s67, 17
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v4, s66, 18
+; VI-NEXT:    v_writelane_b32 v4, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
-; VI-NEXT:    v_writelane_b32 v4, s67, 19
+; VI-NEXT:    v_writelane_b32 v4, s31, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s56, s5, 24
@@ -66357,27 +66357,27 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT:    v_readlane_b32 s30, v4, 18
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_readlane_b32 s67, v4, 19
-; VI-NEXT:    v_readlane_b32 s66, v4, 18
-; VI-NEXT:    v_readlane_b32 s65, v4, 17
-; VI-NEXT:    v_readlane_b32 s64, v4, 16
-; VI-NEXT:    v_readlane_b32 s55, v4, 15
-; VI-NEXT:    v_readlane_b32 s54, v4, 14
-; VI-NEXT:    v_readlane_b32 s53, v4, 13
-; VI-NEXT:    v_readlane_b32 s52, v4, 12
-; VI-NEXT:    v_readlane_b32 s51, v4, 11
-; VI-NEXT:    v_readlane_b32 s50, v4, 10
-; VI-NEXT:    v_readlane_b32 s49, v4, 9
-; VI-NEXT:    v_readlane_b32 s48, v4, 8
-; VI-NEXT:    v_readlane_b32 s39, v4, 7
-; VI-NEXT:    v_readlane_b32 s38, v4, 6
-; VI-NEXT:    v_readlane_b32 s37, v4, 5
-; VI-NEXT:    v_readlane_b32 s36, v4, 4
-; VI-NEXT:    v_readlane_b32 s35, v4, 3
-; VI-NEXT:    v_readlane_b32 s34, v4, 2
-; VI-NEXT:    v_readlane_b32 s31, v4, 1
-; VI-NEXT:    v_readlane_b32 s30, v4, 0
+; VI-NEXT:    v_readlane_b32 s31, v4, 19
+; VI-NEXT:    v_readlane_b32 s67, v4, 17
+; VI-NEXT:    v_readlane_b32 s66, v4, 16
+; VI-NEXT:    v_readlane_b32 s65, v4, 15
+; VI-NEXT:    v_readlane_b32 s64, v4, 14
+; VI-NEXT:    v_readlane_b32 s55, v4, 13
+; VI-NEXT:    v_readlane_b32 s54, v4, 12
+; VI-NEXT:    v_readlane_b32 s53, v4, 11
+; VI-NEXT:    v_readlane_b32 s52, v4, 10
+; VI-NEXT:    v_readlane_b32 s51, v4, 9
+; VI-NEXT:    v_readlane_b32 s50, v4, 8
+; VI-NEXT:    v_readlane_b32 s49, v4, 7
+; VI-NEXT:    v_readlane_b32 s48, v4, 6
+; VI-NEXT:    v_readlane_b32 s39, v4, 5
+; VI-NEXT:    v_readlane_b32 s38, v4, 4
+; VI-NEXT:    v_readlane_b32 s37, v4, 3
+; VI-NEXT:    v_readlane_b32 s36, v4, 2
+; VI-NEXT:    v_readlane_b32 s35, v4, 1
+; VI-NEXT:    v_readlane_b32 s34, v4, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -66440,23 +66440,23 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -66786,22 +66786,22 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v34, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v29, s4
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v5, v33, v5, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -66852,18 +66852,18 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 s42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB97_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s43, s27, 24
@@ -67131,21 +67131,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v17
 ; GFX11-NEXT:    v_or_b32_e32 v3, v19, v15
 ; GFX11-NEXT:    v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[80:83], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[11:14], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[7:10], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -67168,7 +67168,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
+define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -69647,7 +69647,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -69656,15 +69656,44 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s36, v28
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s89, v30
 ; SI-NEXT:    v_readfirstlane_b32 s90, v29
 ; SI-NEXT:    v_readfirstlane_b32 s88, v27
@@ -69690,43 +69719,14 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
 ; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    s_mov_b32 s44, s29
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v41, s28, 0
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
 ; SI-NEXT:    v_writelane_b32 v41, s23, 1
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
 ; SI-NEXT:    v_writelane_b32 v41, s22, 2
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
 ; SI-NEXT:    v_writelane_b32 v41, s21, 3
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
 ; SI-NEXT:    v_writelane_b32 v41, s20, 4
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
-; SI-NEXT:    s_mov_b32 s44, s29
 ; SI-NEXT:    s_mov_b32 s28, s25
 ; SI-NEXT:    s_mov_b32 s29, s24
 ; SI-NEXT:    v_readfirstlane_b32 s94, v23
@@ -70365,6 +70365,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s20, s45, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s20
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -70381,42 +70382,41 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v13, s7
 ; SI-NEXT:    v_mov_b32_e32 v14, s4
 ; SI-NEXT:    v_mov_b32_e32 v15, s5
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -71635,7 +71635,7 @@ end:
   ret <32 x i16> %phi
 }
 
-define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -72128,7 +72128,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -72136,14 +72136,14 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
 ; SI-NEXT:    v_readfirstlane_b32 s60, v1
 ; SI-NEXT:    v_readfirstlane_b32 s56, v0
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
 ; SI-NEXT:    s_lshr_b32 s43, s29, 16
 ; SI-NEXT:    s_lshr_b32 s42, s28, 16
 ; SI-NEXT:    s_lshr_b32 s41, s27, 16
@@ -72161,9 +72161,9 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; SI-NEXT:    s_lshr_b32 s63, s60, 16
 ; SI-NEXT:    s_lshr_b32 s59, s56, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v2
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s30, 6
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s31, 7
 ; SI-NEXT:    s_cbranch_scc0 .LBB101_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshl_b32 s44, s16, 16
@@ -72460,15 +72460,15 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; SI-NEXT:    v_lshr_b64 v[12:13], v[36:37], 16
 ; SI-NEXT:    v_lshr_b64 v[13:14], v[38:39], 16
 ; SI-NEXT:    v_lshr_b64 v[14:15], v[48:49], 16
+; SI-NEXT:    v_readlane_b32 s30, v40, 6
 ; SI-NEXT:    v_lshr_b64 v[15:16], v[50:51], 16
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 7
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -72593,8 +72593,8 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB101_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v18, 1
 ; VI-NEXT:    v_readlane_b32 s30, v18, 0
+; VI-NEXT:    v_readlane_b32 s31, v18, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -72655,8 +72655,8 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB101_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v16, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -72723,7 +72723,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) {
+define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -74236,7 +74236,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -75135,8 +75135,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
 ; VI-NEXT:    v_mov_b32_e32 v14, s30
 ; VI-NEXT:    v_mov_b32_e32 v15, s31
 ; VI-NEXT:  .LBB103_5: ; %end
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 1
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -75486,8 +75486,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s31
 ; GFX9-NEXT:  .LBB103_5: ; %end
-; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v20, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v20, 1
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -76192,7 +76192,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
+define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -78024,7 +78024,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32f16_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -78033,44 +78033,42 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v18, s30, 0
-; SI-NEXT:    v_writelane_b32 v18, s31, 1
-; SI-NEXT:    v_writelane_b32 v18, s34, 2
-; SI-NEXT:    v_writelane_b32 v18, s35, 3
-; SI-NEXT:    v_writelane_b32 v18, s36, 4
-; SI-NEXT:    v_writelane_b32 v18, s37, 5
-; SI-NEXT:    v_writelane_b32 v18, s38, 6
-; SI-NEXT:    v_writelane_b32 v18, s39, 7
-; SI-NEXT:    v_writelane_b32 v18, s48, 8
-; SI-NEXT:    v_writelane_b32 v18, s49, 9
-; SI-NEXT:    v_writelane_b32 v18, s50, 10
-; SI-NEXT:    v_writelane_b32 v18, s51, 11
-; SI-NEXT:    v_writelane_b32 v18, s52, 12
-; SI-NEXT:    v_writelane_b32 v18, s53, 13
-; SI-NEXT:    v_writelane_b32 v18, s54, 14
-; SI-NEXT:    v_writelane_b32 v18, s55, 15
-; SI-NEXT:    v_writelane_b32 v18, s64, 16
-; SI-NEXT:    v_writelane_b32 v18, s65, 17
-; SI-NEXT:    v_writelane_b32 v18, s66, 18
-; SI-NEXT:    v_writelane_b32 v18, s67, 19
-; SI-NEXT:    v_writelane_b32 v18, s68, 20
-; SI-NEXT:    v_writelane_b32 v18, s69, 21
-; SI-NEXT:    v_writelane_b32 v18, s70, 22
-; SI-NEXT:    v_writelane_b32 v18, s71, 23
-; SI-NEXT:    v_writelane_b32 v18, s80, 24
-; SI-NEXT:    v_writelane_b32 v18, s81, 25
-; SI-NEXT:    v_writelane_b32 v18, s82, 26
-; SI-NEXT:    v_writelane_b32 v18, s83, 27
-; SI-NEXT:    v_writelane_b32 v18, s84, 28
-; SI-NEXT:    v_writelane_b32 v18, s85, 29
-; SI-NEXT:    v_writelane_b32 v18, s86, 30
-; SI-NEXT:    v_writelane_b32 v18, s87, 31
-; SI-NEXT:    v_writelane_b32 v18, s96, 32
-; SI-NEXT:    v_writelane_b32 v18, s97, 33
-; SI-NEXT:    v_writelane_b32 v18, s98, 34
+; SI-NEXT:    v_writelane_b32 v18, s34, 0
+; SI-NEXT:    v_writelane_b32 v18, s35, 1
+; SI-NEXT:    v_writelane_b32 v18, s36, 2
+; SI-NEXT:    v_writelane_b32 v18, s37, 3
+; SI-NEXT:    v_writelane_b32 v18, s38, 4
+; SI-NEXT:    v_writelane_b32 v18, s39, 5
+; SI-NEXT:    v_writelane_b32 v18, s48, 6
+; SI-NEXT:    v_writelane_b32 v18, s49, 7
+; SI-NEXT:    v_writelane_b32 v18, s50, 8
+; SI-NEXT:    v_writelane_b32 v18, s51, 9
+; SI-NEXT:    v_writelane_b32 v18, s52, 10
+; SI-NEXT:    v_writelane_b32 v18, s53, 11
+; SI-NEXT:    v_writelane_b32 v18, s54, 12
+; SI-NEXT:    v_writelane_b32 v18, s55, 13
+; SI-NEXT:    v_writelane_b32 v18, s64, 14
+; SI-NEXT:    v_writelane_b32 v18, s65, 15
+; SI-NEXT:    v_writelane_b32 v18, s66, 16
+; SI-NEXT:    v_writelane_b32 v18, s67, 17
+; SI-NEXT:    v_writelane_b32 v18, s68, 18
+; SI-NEXT:    v_writelane_b32 v18, s69, 19
+; SI-NEXT:    v_writelane_b32 v18, s70, 20
+; SI-NEXT:    v_writelane_b32 v18, s71, 21
+; SI-NEXT:    v_writelane_b32 v18, s80, 22
+; SI-NEXT:    v_writelane_b32 v18, s81, 23
+; SI-NEXT:    v_writelane_b32 v18, s82, 24
+; SI-NEXT:    v_writelane_b32 v18, s83, 25
+; SI-NEXT:    v_writelane_b32 v18, s84, 26
+; SI-NEXT:    v_writelane_b32 v18, s85, 27
+; SI-NEXT:    v_writelane_b32 v18, s86, 28
+; SI-NEXT:    v_writelane_b32 v18, s87, 29
+; SI-NEXT:    v_writelane_b32 v18, s96, 30
+; SI-NEXT:    v_writelane_b32 v18, s97, 31
+; SI-NEXT:    v_writelane_b32 v18, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s98, v2
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
-; SI-NEXT:    v_writelane_b32 v18, s99, 35
+; SI-NEXT:    v_writelane_b32 v18, s99, 33
 ; SI-NEXT:    s_lshr_b32 s96, s29, 16
 ; SI-NEXT:    s_lshr_b32 s97, s28, 16
 ; SI-NEXT:    s_lshr_b32 s86, s27, 16
@@ -78088,7 +78086,9 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; SI-NEXT:    s_lshr_b32 s99, s98, 16
 ; SI-NEXT:    s_lshr_b32 s46, s44, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    v_writelane_b32 v18, s30, 34
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
+; SI-NEXT:    v_writelane_b32 v18, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB105_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -78672,43 +78672,43 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT:    v_readlane_b32 s30, v18, 34
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v18, 35
-; SI-NEXT:    v_readlane_b32 s98, v18, 34
-; SI-NEXT:    v_readlane_b32 s97, v18, 33
-; SI-NEXT:    v_readlane_b32 s96, v18, 32
-; SI-NEXT:    v_readlane_b32 s87, v18, 31
-; SI-NEXT:    v_readlane_b32 s86, v18, 30
-; SI-NEXT:    v_readlane_b32 s85, v18, 29
-; SI-NEXT:    v_readlane_b32 s84, v18, 28
-; SI-NEXT:    v_readlane_b32 s83, v18, 27
-; SI-NEXT:    v_readlane_b32 s82, v18, 26
-; SI-NEXT:    v_readlane_b32 s81, v18, 25
-; SI-NEXT:    v_readlane_b32 s80, v18, 24
-; SI-NEXT:    v_readlane_b32 s71, v18, 23
-; SI-NEXT:    v_readlane_b32 s70, v18, 22
-; SI-NEXT:    v_readlane_b32 s69, v18, 21
-; SI-NEXT:    v_readlane_b32 s68, v18, 20
-; SI-NEXT:    v_readlane_b32 s67, v18, 19
-; SI-NEXT:    v_readlane_b32 s66, v18, 18
-; SI-NEXT:    v_readlane_b32 s65, v18, 17
-; SI-NEXT:    v_readlane_b32 s64, v18, 16
-; SI-NEXT:    v_readlane_b32 s55, v18, 15
-; SI-NEXT:    v_readlane_b32 s54, v18, 14
-; SI-NEXT:    v_readlane_b32 s53, v18, 13
-; SI-NEXT:    v_readlane_b32 s52, v18, 12
-; SI-NEXT:    v_readlane_b32 s51, v18, 11
-; SI-NEXT:    v_readlane_b32 s50, v18, 10
-; SI-NEXT:    v_readlane_b32 s49, v18, 9
-; SI-NEXT:    v_readlane_b32 s48, v18, 8
-; SI-NEXT:    v_readlane_b32 s39, v18, 7
-; SI-NEXT:    v_readlane_b32 s38, v18, 6
-; SI-NEXT:    v_readlane_b32 s37, v18, 5
-; SI-NEXT:    v_readlane_b32 s36, v18, 4
-; SI-NEXT:    v_readlane_b32 s35, v18, 3
-; SI-NEXT:    v_readlane_b32 s34, v18, 2
-; SI-NEXT:    v_readlane_b32 s31, v18, 1
-; SI-NEXT:    v_readlane_b32 s30, v18, 0
+; SI-NEXT:    v_readlane_b32 s31, v18, 35
+; SI-NEXT:    v_readlane_b32 s99, v18, 33
+; SI-NEXT:    v_readlane_b32 s98, v18, 32
+; SI-NEXT:    v_readlane_b32 s97, v18, 31
+; SI-NEXT:    v_readlane_b32 s96, v18, 30
+; SI-NEXT:    v_readlane_b32 s87, v18, 29
+; SI-NEXT:    v_readlane_b32 s86, v18, 28
+; SI-NEXT:    v_readlane_b32 s85, v18, 27
+; SI-NEXT:    v_readlane_b32 s84, v18, 26
+; SI-NEXT:    v_readlane_b32 s83, v18, 25
+; SI-NEXT:    v_readlane_b32 s82, v18, 24
+; SI-NEXT:    v_readlane_b32 s81, v18, 23
+; SI-NEXT:    v_readlane_b32 s80, v18, 22
+; SI-NEXT:    v_readlane_b32 s71, v18, 21
+; SI-NEXT:    v_readlane_b32 s70, v18, 20
+; SI-NEXT:    v_readlane_b32 s69, v18, 19
+; SI-NEXT:    v_readlane_b32 s68, v18, 18
+; SI-NEXT:    v_readlane_b32 s67, v18, 17
+; SI-NEXT:    v_readlane_b32 s66, v18, 16
+; SI-NEXT:    v_readlane_b32 s65, v18, 15
+; SI-NEXT:    v_readlane_b32 s64, v18, 14
+; SI-NEXT:    v_readlane_b32 s55, v18, 13
+; SI-NEXT:    v_readlane_b32 s54, v18, 12
+; SI-NEXT:    v_readlane_b32 s53, v18, 11
+; SI-NEXT:    v_readlane_b32 s52, v18, 10
+; SI-NEXT:    v_readlane_b32 s51, v18, 9
+; SI-NEXT:    v_readlane_b32 s50, v18, 8
+; SI-NEXT:    v_readlane_b32 s49, v18, 7
+; SI-NEXT:    v_readlane_b32 s48, v18, 6
+; SI-NEXT:    v_readlane_b32 s39, v18, 5
+; SI-NEXT:    v_readlane_b32 s38, v18, 4
+; SI-NEXT:    v_readlane_b32 s37, v18, 3
+; SI-NEXT:    v_readlane_b32 s36, v18, 2
+; SI-NEXT:    v_readlane_b32 s35, v18, 1
+; SI-NEXT:    v_readlane_b32 s34, v18, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -78722,27 +78722,27 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
+; VI-NEXT:    v_writelane_b32 v63, s31, 19
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
@@ -79137,26 +79137,26 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 18
+; VI-NEXT:    v_readlane_b32 s31, v63, 19
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v2, v29, v2, s4
 ; VI-NEXT:    v_perm_b32 v1, v50, v1, s4
@@ -79197,23 +79197,23 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -79544,22 +79544,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v34, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v29, s4
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_perm_b32 v5, v33, v5, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -79610,18 +79610,18 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_mov_b32 s42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB105_3
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-NEXT:    s_lshr_b32 s43, s27, 24
@@ -79889,21 +79889,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; GFX11-NEXT:    v_or_b32_e32 v2, v4, v17
 ; GFX11-NEXT:    v_or_b32_e32 v3, v19, v15
 ; GFX11-NEXT:    v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 v0, v[80:83], off
 ; GFX11-NEXT:    scratch_store_b128 v0, v[11:14], off offset:16
 ; GFX11-NEXT:    scratch_store_b128 v0, v[7:10], off offset:32
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -79926,7 +79926,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
+define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -82405,7 +82405,7 @@ end:
   ret <32 x half> %phi
 }
 
-define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -82414,15 +82414,44 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s36, v28
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s89, v30
 ; SI-NEXT:    v_readfirstlane_b32 s90, v29
 ; SI-NEXT:    v_readfirstlane_b32 s88, v27
@@ -82448,43 +82477,14 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
 ; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    s_mov_b32 s44, s29
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v41, s28, 0
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
 ; SI-NEXT:    v_writelane_b32 v41, s23, 1
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
 ; SI-NEXT:    v_writelane_b32 v41, s22, 2
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
 ; SI-NEXT:    v_writelane_b32 v41, s21, 3
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
 ; SI-NEXT:    v_writelane_b32 v41, s20, 4
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
-; SI-NEXT:    s_mov_b32 s44, s29
 ; SI-NEXT:    s_mov_b32 s28, s25
 ; SI-NEXT:    s_mov_b32 s29, s24
 ; SI-NEXT:    v_readfirstlane_b32 s94, v23
@@ -83123,6 +83123,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s20, s45, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s20
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -83139,42 +83140,41 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v13, s7
 ; SI-NEXT:    v_mov_b32_e32 v14, s4
 ; SI-NEXT:    v_mov_b32_e32 v15, s5
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -84393,7 +84393,7 @@ end:
   ret <32 x half> %phi
 }
 
-define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
+define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v64i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -87378,7 +87378,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -87387,42 +87387,42 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
 ; SI-NEXT:    v_readfirstlane_b32 s42, v2
 ; SI-NEXT:    v_readfirstlane_b32 s44, v1
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
 ; SI-NEXT:    s_and_b32 s4, s29, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_and_b32 s6, s28, 0xffff0000
@@ -87456,7 +87456,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    s_and_b32 s45, s44, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s44, s44, 16
 ; SI-NEXT:    v_readfirstlane_b32 s46, v3
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
 ; SI-NEXT:    s_cmp_lg_u32 s46, 0
 ; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s41
 ; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s16
@@ -87490,7 +87490,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_mul_f32_e64 v37, 1.0, s44
 ; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s43
 ; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s42
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB109_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -87987,44 +87987,44 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v1, s5, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_readlane_b32 s75, v41, 1
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -88107,27 +88107,27 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v63, s30, 0
-; VI-NEXT:    v_writelane_b32 v63, s31, 1
-; VI-NEXT:    v_writelane_b32 v63, s34, 2
-; VI-NEXT:    v_writelane_b32 v63, s35, 3
-; VI-NEXT:    v_writelane_b32 v63, s36, 4
-; VI-NEXT:    v_writelane_b32 v63, s37, 5
-; VI-NEXT:    v_writelane_b32 v63, s38, 6
-; VI-NEXT:    v_writelane_b32 v63, s39, 7
-; VI-NEXT:    v_writelane_b32 v63, s48, 8
-; VI-NEXT:    v_writelane_b32 v63, s49, 9
-; VI-NEXT:    v_writelane_b32 v63, s50, 10
-; VI-NEXT:    v_writelane_b32 v63, s51, 11
-; VI-NEXT:    v_writelane_b32 v63, s52, 12
-; VI-NEXT:    v_writelane_b32 v63, s53, 13
-; VI-NEXT:    v_writelane_b32 v63, s54, 14
-; VI-NEXT:    v_writelane_b32 v63, s55, 15
-; VI-NEXT:    v_writelane_b32 v63, s64, 16
-; VI-NEXT:    v_writelane_b32 v63, s65, 17
-; VI-NEXT:    v_writelane_b32 v63, s66, 18
+; VI-NEXT:    v_writelane_b32 v63, s34, 0
+; VI-NEXT:    v_writelane_b32 v63, s35, 1
+; VI-NEXT:    v_writelane_b32 v63, s36, 2
+; VI-NEXT:    v_writelane_b32 v63, s37, 3
+; VI-NEXT:    v_writelane_b32 v63, s38, 4
+; VI-NEXT:    v_writelane_b32 v63, s39, 5
+; VI-NEXT:    v_writelane_b32 v63, s48, 6
+; VI-NEXT:    v_writelane_b32 v63, s49, 7
+; VI-NEXT:    v_writelane_b32 v63, s50, 8
+; VI-NEXT:    v_writelane_b32 v63, s51, 9
+; VI-NEXT:    v_writelane_b32 v63, s52, 10
+; VI-NEXT:    v_writelane_b32 v63, s53, 11
+; VI-NEXT:    v_writelane_b32 v63, s54, 12
+; VI-NEXT:    v_writelane_b32 v63, s55, 13
+; VI-NEXT:    v_writelane_b32 v63, s64, 14
+; VI-NEXT:    v_writelane_b32 v63, s65, 15
+; VI-NEXT:    v_writelane_b32 v63, s66, 16
+; VI-NEXT:    v_writelane_b32 v63, s67, 17
+; VI-NEXT:    v_writelane_b32 v63, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s4, v3
-; VI-NEXT:    v_writelane_b32 v63, s67, 19
+; VI-NEXT:    v_writelane_b32 v63, s31, 19
 ; VI-NEXT:    v_readfirstlane_b32 s5, v2
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s4, v1
@@ -88758,26 +88758,26 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; VI-NEXT:    v_perm_b32 v1, v22, v9, s4
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT:    v_readlane_b32 s67, v63, 19
-; VI-NEXT:    v_readlane_b32 s66, v63, 18
-; VI-NEXT:    v_readlane_b32 s65, v63, 17
-; VI-NEXT:    v_readlane_b32 s64, v63, 16
-; VI-NEXT:    v_readlane_b32 s55, v63, 15
-; VI-NEXT:    v_readlane_b32 s54, v63, 14
-; VI-NEXT:    v_readlane_b32 s53, v63, 13
-; VI-NEXT:    v_readlane_b32 s52, v63, 12
-; VI-NEXT:    v_readlane_b32 s51, v63, 11
-; VI-NEXT:    v_readlane_b32 s50, v63, 10
-; VI-NEXT:    v_readlane_b32 s49, v63, 9
-; VI-NEXT:    v_readlane_b32 s48, v63, 8
-; VI-NEXT:    v_readlane_b32 s39, v63, 7
-; VI-NEXT:    v_readlane_b32 s38, v63, 6
-; VI-NEXT:    v_readlane_b32 s37, v63, 5
-; VI-NEXT:    v_readlane_b32 s36, v63, 4
-; VI-NEXT:    v_readlane_b32 s35, v63, 3
-; VI-NEXT:    v_readlane_b32 s34, v63, 2
-; VI-NEXT:    v_readlane_b32 s31, v63, 1
-; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    v_readlane_b32 s30, v63, 18
+; VI-NEXT:    v_readlane_b32 s31, v63, 19
+; VI-NEXT:    v_readlane_b32 s67, v63, 17
+; VI-NEXT:    v_readlane_b32 s66, v63, 16
+; VI-NEXT:    v_readlane_b32 s65, v63, 15
+; VI-NEXT:    v_readlane_b32 s64, v63, 14
+; VI-NEXT:    v_readlane_b32 s55, v63, 13
+; VI-NEXT:    v_readlane_b32 s54, v63, 12
+; VI-NEXT:    v_readlane_b32 s53, v63, 11
+; VI-NEXT:    v_readlane_b32 s52, v63, 10
+; VI-NEXT:    v_readlane_b32 s51, v63, 9
+; VI-NEXT:    v_readlane_b32 s50, v63, 8
+; VI-NEXT:    v_readlane_b32 s49, v63, 7
+; VI-NEXT:    v_readlane_b32 s48, v63, 6
+; VI-NEXT:    v_readlane_b32 s39, v63, 5
+; VI-NEXT:    v_readlane_b32 s38, v63, 4
+; VI-NEXT:    v_readlane_b32 s37, v63, 3
+; VI-NEXT:    v_readlane_b32 s36, v63, 2
+; VI-NEXT:    v_readlane_b32 s35, v63, 1
+; VI-NEXT:    v_readlane_b32 s34, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -88810,23 +88810,23 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v63, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v63, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v63, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v63, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v63, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v63, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v63, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v63, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v63, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v63, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v63, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v63, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v63, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v63, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v63, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v63, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v63, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v63, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v63, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v63, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v63, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v63, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v63, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v63, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v63, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 14
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9-NEXT:    v_writelane_b32 v63, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 15
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -89383,22 +89383,22 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_perm_b32 v2, v44, v2, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_readlane_b32 s55, v63, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v63, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v63, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v63, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v63, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v63, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v63, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v63, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v63, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v63, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v63, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v63, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v63, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v63, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v63, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v63, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v63, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v63, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v63, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v63, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v63, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v63, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v63, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v63, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v63, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v63, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
 ; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
@@ -89499,18 +89499,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s42, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB109_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s27, 24
@@ -90066,21 +90066,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v20, v10
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v21, v7
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v22
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-TRUE16-NEXT:    s_clause 0x3
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[11:14], off offset:16
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[16:19], off offset:32
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s0
@@ -90093,18 +90093,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s42, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB109_3
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s78, s27, 24
@@ -90665,21 +90665,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v19, v21
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v22, v23
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v13, v20
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 8
 ; GFX11-FAKE16-NEXT:    s_clause 0x3
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:16
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[14:17], off offset:32
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s0
@@ -90702,7 +90702,7 @@ end:
   ret <64 x i8> %phi
 }
 
-define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93193,7 +93193,7 @@ end:
   ret <32 x bfloat> %phi
 }
 
-define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v64i8_to_v32bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93202,15 +93202,13 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
-; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
 ; SI-NEXT:    s_mov_b32 s6, s19
 ; SI-NEXT:    v_readfirstlane_b32 s19, v28
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:76
-; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
 ; SI-NEXT:    v_readfirstlane_b32 s8, v30
 ; SI-NEXT:    v_readfirstlane_b32 s37, v29
 ; SI-NEXT:    v_readfirstlane_b32 s47, v27
@@ -93236,51 +93234,52 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v40, s38, 6
-; SI-NEXT:    v_writelane_b32 v40, s39, 7
-; SI-NEXT:    v_writelane_b32 v40, s48, 8
-; SI-NEXT:    v_writelane_b32 v40, s49, 9
-; SI-NEXT:    v_writelane_b32 v40, s50, 10
-; SI-NEXT:    v_writelane_b32 v40, s51, 11
-; SI-NEXT:    v_writelane_b32 v40, s52, 12
-; SI-NEXT:    v_writelane_b32 v40, s53, 13
-; SI-NEXT:    v_writelane_b32 v40, s54, 14
-; SI-NEXT:    v_writelane_b32 v40, s55, 15
-; SI-NEXT:    v_writelane_b32 v40, s64, 16
-; SI-NEXT:    v_writelane_b32 v40, s65, 17
-; SI-NEXT:    v_writelane_b32 v40, s66, 18
-; SI-NEXT:    v_writelane_b32 v40, s67, 19
-; SI-NEXT:    v_writelane_b32 v40, s68, 20
-; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
 ; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v41, s18, 0
-; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
 ; SI-NEXT:    v_writelane_b32 v41, s6, 1
-; SI-NEXT:    v_writelane_b32 v40, s80, 24
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
 ; SI-NEXT:    v_writelane_b32 v41, s17, 2
-; SI-NEXT:    v_writelane_b32 v40, s81, 25
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
 ; SI-NEXT:    v_writelane_b32 v41, s16, 3
-; SI-NEXT:    v_writelane_b32 v40, s82, 26
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
 ; SI-NEXT:    v_writelane_b32 v41, s22, 4
-; SI-NEXT:    v_writelane_b32 v40, s83, 27
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
 ; SI-NEXT:    v_writelane_b32 v41, s23, 5
-; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
 ; SI-NEXT:    v_writelane_b32 v41, s21, 6
-; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
 ; SI-NEXT:    v_writelane_b32 v41, s20, 7
-; SI-NEXT:    v_writelane_b32 v40, s86, 30
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
 ; SI-NEXT:    v_writelane_b32 v41, s26, 8
-; SI-NEXT:    v_writelane_b32 v40, s87, 31
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
 ; SI-NEXT:    v_writelane_b32 v41, s27, 9
-; SI-NEXT:    v_writelane_b32 v40, s96, 32
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
 ; SI-NEXT:    v_writelane_b32 v41, s25, 10
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
 ; SI-NEXT:    v_writelane_b32 v41, s24, 11
-; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
 ; SI-NEXT:    v_writelane_b32 v41, s29, 12
-; SI-NEXT:    v_writelane_b32 v40, s99, 35
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s75, v23
 ; SI-NEXT:    v_readfirstlane_b32 s76, v22
 ; SI-NEXT:    v_readfirstlane_b32 s62, v21
@@ -93309,9 +93308,9 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s87, v24
 ; SI-NEXT:    v_readfirstlane_b32 s86, v25
 ; SI-NEXT:    v_readfirstlane_b32 s98, v26
-; SI-NEXT:    v_readfirstlane_b32 s99, v27
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v28
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s99, v27
 ; SI-NEXT:    v_readfirstlane_b32 s82, v29
 ; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_readfirstlane_b32 s81, v30
@@ -93342,6 +93341,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s51, v51
 ; SI-NEXT:    v_writelane_b32 v41, s28, 13
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_writelane_b32 v41, s7, 14
 ; SI-NEXT:    s_cbranch_scc0 .LBB111_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -93795,43 +93795,43 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s69
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s71
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
 ; SI-NEXT:    v_lshr_b64 v[15:16], v[15:16], 16
-; SI-NEXT:    v_readlane_b32 s99, v40, 35
-; SI-NEXT:    v_readlane_b32 s98, v40, 34
-; SI-NEXT:    v_readlane_b32 s97, v40, 33
-; SI-NEXT:    v_readlane_b32 s96, v40, 32
-; SI-NEXT:    v_readlane_b32 s87, v40, 31
-; SI-NEXT:    v_readlane_b32 s86, v40, 30
-; SI-NEXT:    v_readlane_b32 s85, v40, 29
-; SI-NEXT:    v_readlane_b32 s84, v40, 28
-; SI-NEXT:    v_readlane_b32 s83, v40, 27
-; SI-NEXT:    v_readlane_b32 s82, v40, 26
-; SI-NEXT:    v_readlane_b32 s81, v40, 25
-; SI-NEXT:    v_readlane_b32 s80, v40, 24
-; SI-NEXT:    v_readlane_b32 s71, v40, 23
-; SI-NEXT:    v_readlane_b32 s70, v40, 22
-; SI-NEXT:    v_readlane_b32 s69, v40, 21
-; SI-NEXT:    v_readlane_b32 s68, v40, 20
-; SI-NEXT:    v_readlane_b32 s67, v40, 19
-; SI-NEXT:    v_readlane_b32 s66, v40, 18
-; SI-NEXT:    v_readlane_b32 s65, v40, 17
-; SI-NEXT:    v_readlane_b32 s64, v40, 16
-; SI-NEXT:    v_readlane_b32 s55, v40, 15
-; SI-NEXT:    v_readlane_b32 s54, v40, 14
-; SI-NEXT:    v_readlane_b32 s53, v40, 13
-; SI-NEXT:    v_readlane_b32 s52, v40, 12
-; SI-NEXT:    v_readlane_b32 s51, v40, 11
-; SI-NEXT:    v_readlane_b32 s50, v40, 10
-; SI-NEXT:    v_readlane_b32 s49, v40, 9
-; SI-NEXT:    v_readlane_b32 s48, v40, 8
-; SI-NEXT:    v_readlane_b32 s39, v40, 7
-; SI-NEXT:    v_readlane_b32 s38, v40, 6
-; SI-NEXT:    v_readlane_b32 s37, v40, 5
-; SI-NEXT:    v_readlane_b32 s36, v40, 4
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -95084,3 +95084,5 @@ end:
   %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <32 x bfloat> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 39a3c96b9839a..e04fb2918a8ca 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) {
+define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -148,7 +148,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v18i32_to_v18f32_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v18i32_to_v18f32_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -371,7 +371,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) {
+define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -504,7 +504,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -952,7 +952,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) {
+define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1094,7 +1094,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v18i32_to_v9i64_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v18i32_to_v9i64_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1317,7 +1317,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) {
+define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1464,7 +1464,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v9i64_to_v18i32_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v9i64_to_v18i32_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1687,7 +1687,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) {
+define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1829,7 +1829,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v18i32_to_v9f64_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v18i32_to_v9f64_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2052,7 +2052,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) {
+define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2158,7 +2158,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2570,7 +2570,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) {
+define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3100,7 +3100,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3683,7 +3683,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) {
+define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4424,7 +4424,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5138,7 +5138,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) {
+define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5668,7 +5668,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18i32_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6251,7 +6251,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
+define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v18i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7066,7 +7066,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v18i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7850,7 +7850,7 @@ end:
   ret <18 x i32> %phi
 }
 
-define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) {
+define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7983,7 +7983,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8431,7 +8431,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) {
+define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8578,7 +8578,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v9i64_to_v18f32_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v9i64_to_v18f32_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8801,7 +8801,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) {
+define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8934,7 +8934,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9382,7 +9382,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) {
+define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9488,7 +9488,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9900,7 +9900,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) {
+define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10412,7 +10412,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11210,7 +11210,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) {
+define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11951,7 +11951,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12665,7 +12665,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) {
+define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13177,7 +13177,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v18f32_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13975,7 +13975,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
+define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v18f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14790,7 +14790,7 @@ end:
   ret <18 x float> %phi
 }
 
-define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v18f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15574,7 +15574,7 @@ end:
   ret <18 x float> %phi
 }
 
-define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) {
+define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15721,7 +15721,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v9i64_to_v9f64_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v9i64_to_v9f64_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15943,7 +15943,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) {
+define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16049,7 +16049,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16461,7 +16461,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) {
+define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17001,7 +17001,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17584,7 +17584,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) {
+define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18325,7 +18325,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19039,7 +19039,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) {
+define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19579,7 +19579,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9i64_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20162,7 +20162,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
+define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v9i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20977,7 +20977,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v9i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21761,7 +21761,7 @@ end:
   ret <9 x i64> %phi
 }
 
-define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) {
+define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22246,7 +22246,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22999,7 +22999,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) {
+define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23740,7 +23740,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24454,7 +24454,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) {
+define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24939,7 +24939,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v9f64_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25692,7 +25692,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
+define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v9f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26507,7 +26507,7 @@ end:
   ret <9 x double> %phi
 }
 
-define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v9f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27291,7 +27291,7 @@ end:
   ret <9 x double> %phi
 }
 
-define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
+define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v36f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28045,7 +28045,7 @@ end:
   ret <36 x half> %phi
 }
 
-define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36i16_to_v36f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28053,26 +28053,26 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v18, s30, 0
-; SI-NEXT:    v_writelane_b32 v18, s31, 1
-; SI-NEXT:    v_writelane_b32 v18, s34, 2
-; SI-NEXT:    v_writelane_b32 v18, s35, 3
-; SI-NEXT:    v_writelane_b32 v18, s36, 4
-; SI-NEXT:    v_writelane_b32 v18, s37, 5
-; SI-NEXT:    v_writelane_b32 v18, s38, 6
-; SI-NEXT:    v_writelane_b32 v18, s39, 7
-; SI-NEXT:    v_writelane_b32 v18, s48, 8
-; SI-NEXT:    v_writelane_b32 v18, s49, 9
-; SI-NEXT:    v_writelane_b32 v18, s50, 10
-; SI-NEXT:    v_writelane_b32 v18, s51, 11
-; SI-NEXT:    v_writelane_b32 v18, s52, 12
-; SI-NEXT:    v_writelane_b32 v18, s53, 13
-; SI-NEXT:    v_writelane_b32 v18, s54, 14
-; SI-NEXT:    v_writelane_b32 v18, s55, 15
+; SI-NEXT:    v_writelane_b32 v18, s34, 0
+; SI-NEXT:    v_writelane_b32 v18, s35, 1
+; SI-NEXT:    v_writelane_b32 v18, s36, 2
+; SI-NEXT:    v_writelane_b32 v18, s37, 3
+; SI-NEXT:    v_writelane_b32 v18, s38, 4
+; SI-NEXT:    v_writelane_b32 v18, s39, 5
+; SI-NEXT:    v_writelane_b32 v18, s48, 6
+; SI-NEXT:    v_writelane_b32 v18, s49, 7
+; SI-NEXT:    v_writelane_b32 v18, s50, 8
+; SI-NEXT:    v_writelane_b32 v18, s51, 9
+; SI-NEXT:    v_writelane_b32 v18, s52, 10
+; SI-NEXT:    v_writelane_b32 v18, s53, 11
+; SI-NEXT:    v_writelane_b32 v18, s54, 12
+; SI-NEXT:    v_writelane_b32 v18, s55, 13
+; SI-NEXT:    v_writelane_b32 v18, s30, 14
 ; SI-NEXT:    v_readfirstlane_b32 s53, v3
 ; SI-NEXT:    v_readfirstlane_b32 s55, v2
 ; SI-NEXT:    v_readfirstlane_b32 s50, v1
 ; SI-NEXT:    v_readfirstlane_b32 s52, v0
+; SI-NEXT:    v_writelane_b32 v18, s31, 15
 ; SI-NEXT:    s_lshr_b32 s30, s29, 16
 ; SI-NEXT:    s_lshr_b32 s49, s28, 16
 ; SI-NEXT:    s_lshr_b32 s95, s27, 16
@@ -28332,6 +28332,7 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s21, s43, 0xffff
 ; SI-NEXT:    s_lshl_b32 s22, s34, 16
 ; SI-NEXT:    s_or_b32 s21, s21, s22
+; SI-NEXT:    v_readlane_b32 s30, v18, 14
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s14
@@ -28350,22 +28351,21 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v15, s19
 ; SI-NEXT:    v_mov_b32_e32 v16, s20
 ; SI-NEXT:    v_mov_b32_e32 v17, s21
-; SI-NEXT:    v_readlane_b32 s55, v18, 15
-; SI-NEXT:    v_readlane_b32 s54, v18, 14
-; SI-NEXT:    v_readlane_b32 s53, v18, 13
-; SI-NEXT:    v_readlane_b32 s52, v18, 12
-; SI-NEXT:    v_readlane_b32 s51, v18, 11
-; SI-NEXT:    v_readlane_b32 s50, v18, 10
-; SI-NEXT:    v_readlane_b32 s49, v18, 9
-; SI-NEXT:    v_readlane_b32 s48, v18, 8
-; SI-NEXT:    v_readlane_b32 s39, v18, 7
-; SI-NEXT:    v_readlane_b32 s38, v18, 6
-; SI-NEXT:    v_readlane_b32 s37, v18, 5
-; SI-NEXT:    v_readlane_b32 s36, v18, 4
-; SI-NEXT:    v_readlane_b32 s35, v18, 3
-; SI-NEXT:    v_readlane_b32 s34, v18, 2
-; SI-NEXT:    v_readlane_b32 s31, v18, 1
-; SI-NEXT:    v_readlane_b32 s30, v18, 0
+; SI-NEXT:    v_readlane_b32 s31, v18, 15
+; SI-NEXT:    v_readlane_b32 s55, v18, 13
+; SI-NEXT:    v_readlane_b32 s54, v18, 12
+; SI-NEXT:    v_readlane_b32 s53, v18, 11
+; SI-NEXT:    v_readlane_b32 s52, v18, 10
+; SI-NEXT:    v_readlane_b32 s51, v18, 9
+; SI-NEXT:    v_readlane_b32 s50, v18, 8
+; SI-NEXT:    v_readlane_b32 s49, v18, 7
+; SI-NEXT:    v_readlane_b32 s48, v18, 6
+; SI-NEXT:    v_readlane_b32 s39, v18, 5
+; SI-NEXT:    v_readlane_b32 s38, v18, 4
+; SI-NEXT:    v_readlane_b32 s37, v18, 3
+; SI-NEXT:    v_readlane_b32 s36, v18, 2
+; SI-NEXT:    v_readlane_b32 s35, v18, 1
+; SI-NEXT:    v_readlane_b32 s34, v18, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28988,7 +28988,7 @@ end:
   ret <36 x half> %phi
 }
 
-define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) {
+define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v36i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29588,7 +29588,7 @@ end:
   ret <36 x i16> %phi
 }
 
-define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v36f16_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30487,3 +30487,5 @@ end:
   %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <36 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 666b5353465eb..b59e14fe0da33 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) {
+define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -156,7 +156,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v20i32_to_v20f32_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v20i32_to_v20f32_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -402,7 +402,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) {
+define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -542,7 +542,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1022,7 +1022,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) {
+define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1172,7 +1172,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v20i32_to_v10i64_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v20i32_to_v10i64_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1418,7 +1418,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) {
+define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1573,7 +1573,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v10i64_to_v20i32_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v10i64_to_v20i32_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1819,7 +1819,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) {
+define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1969,7 +1969,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v20i32_to_v10f64_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v20i32_to_v10f64_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2215,7 +2215,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) {
+define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2325,7 +2325,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2765,7 +2765,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) {
+define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3343,7 +3343,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3989,7 +3989,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
+define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4837,7 +4837,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5105,30 +5105,32 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s34, 0
+; VI-NEXT:    v_writelane_b32 v20, s35, 1
+; VI-NEXT:    v_writelane_b32 v20, s36, 2
+; VI-NEXT:    v_writelane_b32 v20, s37, 3
+; VI-NEXT:    v_writelane_b32 v20, s38, 4
+; VI-NEXT:    v_writelane_b32 v20, s39, 5
+; VI-NEXT:    v_writelane_b32 v20, s48, 6
+; VI-NEXT:    v_writelane_b32 v20, s49, 7
+; VI-NEXT:    v_writelane_b32 v20, s50, 8
+; VI-NEXT:    v_writelane_b32 v20, s51, 9
+; VI-NEXT:    v_writelane_b32 v20, s52, 10
+; VI-NEXT:    v_writelane_b32 v20, s53, 11
+; VI-NEXT:    v_writelane_b32 v20, s54, 12
+; VI-NEXT:    v_writelane_b32 v20, s55, 13
+; VI-NEXT:    v_writelane_b32 v20, s64, 14
+; VI-NEXT:    v_writelane_b32 v20, s65, 15
+; VI-NEXT:    v_writelane_b32 v20, s66, 16
+; VI-NEXT:    v_writelane_b32 v20, s67, 17
+; VI-NEXT:    v_writelane_b32 v20, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s7, v5
 ; VI-NEXT:    v_readfirstlane_b32 s9, v4
 ; VI-NEXT:    v_readfirstlane_b32 s12, v3
 ; VI-NEXT:    v_readfirstlane_b32 s15, v2
 ; VI-NEXT:    v_readfirstlane_b32 s74, v1
 ; VI-NEXT:    v_readfirstlane_b32 s77, v0
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s31, 19
 ; VI-NEXT:    s_lshr_b32 s11, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -5150,9 +5152,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_lshr_b32 s73, s74, 16
 ; VI-NEXT:    s_lshr_b32 s76, s77, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB15_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -5318,6 +5318,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s55, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v20, 18
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -5338,26 +5339,25 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v17, s53
 ; VI-NEXT:    v_mov_b32_e32 v18, s54
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
-; VI-NEXT:    v_readlane_b32 s67, v20, 19
-; VI-NEXT:    v_readlane_b32 s66, v20, 18
-; VI-NEXT:    v_readlane_b32 s65, v20, 17
-; VI-NEXT:    v_readlane_b32 s64, v20, 16
-; VI-NEXT:    v_readlane_b32 s55, v20, 15
-; VI-NEXT:    v_readlane_b32 s54, v20, 14
-; VI-NEXT:    v_readlane_b32 s53, v20, 13
-; VI-NEXT:    v_readlane_b32 s52, v20, 12
-; VI-NEXT:    v_readlane_b32 s51, v20, 11
-; VI-NEXT:    v_readlane_b32 s50, v20, 10
-; VI-NEXT:    v_readlane_b32 s49, v20, 9
-; VI-NEXT:    v_readlane_b32 s48, v20, 8
-; VI-NEXT:    v_readlane_b32 s39, v20, 7
-; VI-NEXT:    v_readlane_b32 s38, v20, 6
-; VI-NEXT:    v_readlane_b32 s37, v20, 5
-; VI-NEXT:    v_readlane_b32 s36, v20, 4
-; VI-NEXT:    v_readlane_b32 s35, v20, 3
-; VI-NEXT:    v_readlane_b32 s34, v20, 2
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
-; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 19
+; VI-NEXT:    v_readlane_b32 s67, v20, 17
+; VI-NEXT:    v_readlane_b32 s66, v20, 16
+; VI-NEXT:    v_readlane_b32 s65, v20, 15
+; VI-NEXT:    v_readlane_b32 s64, v20, 14
+; VI-NEXT:    v_readlane_b32 s55, v20, 13
+; VI-NEXT:    v_readlane_b32 s54, v20, 12
+; VI-NEXT:    v_readlane_b32 s53, v20, 11
+; VI-NEXT:    v_readlane_b32 s52, v20, 10
+; VI-NEXT:    v_readlane_b32 s51, v20, 9
+; VI-NEXT:    v_readlane_b32 s50, v20, 8
+; VI-NEXT:    v_readlane_b32 s49, v20, 7
+; VI-NEXT:    v_readlane_b32 s48, v20, 6
+; VI-NEXT:    v_readlane_b32 s39, v20, 5
+; VI-NEXT:    v_readlane_b32 s38, v20, 4
+; VI-NEXT:    v_readlane_b32 s37, v20, 3
+; VI-NEXT:    v_readlane_b32 s36, v20, 2
+; VI-NEXT:    v_readlane_b32 s35, v20, 1
+; VI-NEXT:    v_readlane_b32 s34, v20, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5623,7 +5623,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) {
+define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6201,7 +6201,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20i32_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6847,7 +6847,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) {
+define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v20i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7780,7 +7780,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v20i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8122,30 +8122,32 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v5
 ; VI-NEXT:    v_readfirstlane_b32 s8, v4
 ; VI-NEXT:    v_readfirstlane_b32 s11, v3
 ; VI-NEXT:    v_readfirstlane_b32 s14, v2
 ; VI-NEXT:    v_readfirstlane_b32 s73, v1
 ; VI-NEXT:    v_readfirstlane_b32 s76, v0
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
+; VI-NEXT:    v_writelane_b32 v32, s31, 19
 ; VI-NEXT:    s_lshr_b32 s10, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -8167,9 +8169,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    s_lshr_b32 s75, s73, 16
 ; VI-NEXT:    s_lshr_b32 s78, s76, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB19_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -8353,26 +8353,26 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 18
+; VI-NEXT:    v_readlane_b32 s31, v32, 19
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8636,7 +8636,7 @@ end:
   ret <20 x i32> %phi
 }
 
-define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) {
+define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8776,7 +8776,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9256,7 +9256,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) {
+define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9411,7 +9411,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v10i64_to_v20f32_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v10i64_to_v20f32_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9657,7 +9657,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) {
+define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9797,7 +9797,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10277,7 +10277,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) {
+define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10387,7 +10387,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10827,7 +10827,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) {
+define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11385,7 +11385,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12269,7 +12269,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
+define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13117,7 +13117,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13385,30 +13385,32 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s34, 0
+; VI-NEXT:    v_writelane_b32 v20, s35, 1
+; VI-NEXT:    v_writelane_b32 v20, s36, 2
+; VI-NEXT:    v_writelane_b32 v20, s37, 3
+; VI-NEXT:    v_writelane_b32 v20, s38, 4
+; VI-NEXT:    v_writelane_b32 v20, s39, 5
+; VI-NEXT:    v_writelane_b32 v20, s48, 6
+; VI-NEXT:    v_writelane_b32 v20, s49, 7
+; VI-NEXT:    v_writelane_b32 v20, s50, 8
+; VI-NEXT:    v_writelane_b32 v20, s51, 9
+; VI-NEXT:    v_writelane_b32 v20, s52, 10
+; VI-NEXT:    v_writelane_b32 v20, s53, 11
+; VI-NEXT:    v_writelane_b32 v20, s54, 12
+; VI-NEXT:    v_writelane_b32 v20, s55, 13
+; VI-NEXT:    v_writelane_b32 v20, s64, 14
+; VI-NEXT:    v_writelane_b32 v20, s65, 15
+; VI-NEXT:    v_writelane_b32 v20, s66, 16
+; VI-NEXT:    v_writelane_b32 v20, s67, 17
+; VI-NEXT:    v_writelane_b32 v20, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s7, v5
 ; VI-NEXT:    v_readfirstlane_b32 s9, v4
 ; VI-NEXT:    v_readfirstlane_b32 s12, v3
 ; VI-NEXT:    v_readfirstlane_b32 s15, v2
 ; VI-NEXT:    v_readfirstlane_b32 s74, v1
 ; VI-NEXT:    v_readfirstlane_b32 s77, v0
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s31, 19
 ; VI-NEXT:    s_lshr_b32 s11, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -13430,9 +13432,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_lshr_b32 s73, s74, 16
 ; VI-NEXT:    s_lshr_b32 s76, s77, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB31_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -13598,6 +13598,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s55, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v20, 18
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -13618,26 +13619,25 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v17, s53
 ; VI-NEXT:    v_mov_b32_e32 v18, s54
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
-; VI-NEXT:    v_readlane_b32 s67, v20, 19
-; VI-NEXT:    v_readlane_b32 s66, v20, 18
-; VI-NEXT:    v_readlane_b32 s65, v20, 17
-; VI-NEXT:    v_readlane_b32 s64, v20, 16
-; VI-NEXT:    v_readlane_b32 s55, v20, 15
-; VI-NEXT:    v_readlane_b32 s54, v20, 14
-; VI-NEXT:    v_readlane_b32 s53, v20, 13
-; VI-NEXT:    v_readlane_b32 s52, v20, 12
-; VI-NEXT:    v_readlane_b32 s51, v20, 11
-; VI-NEXT:    v_readlane_b32 s50, v20, 10
-; VI-NEXT:    v_readlane_b32 s49, v20, 9
-; VI-NEXT:    v_readlane_b32 s48, v20, 8
-; VI-NEXT:    v_readlane_b32 s39, v20, 7
-; VI-NEXT:    v_readlane_b32 s38, v20, 6
-; VI-NEXT:    v_readlane_b32 s37, v20, 5
-; VI-NEXT:    v_readlane_b32 s36, v20, 4
-; VI-NEXT:    v_readlane_b32 s35, v20, 3
-; VI-NEXT:    v_readlane_b32 s34, v20, 2
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
-; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 19
+; VI-NEXT:    v_readlane_b32 s67, v20, 17
+; VI-NEXT:    v_readlane_b32 s66, v20, 16
+; VI-NEXT:    v_readlane_b32 s65, v20, 15
+; VI-NEXT:    v_readlane_b32 s64, v20, 14
+; VI-NEXT:    v_readlane_b32 s55, v20, 13
+; VI-NEXT:    v_readlane_b32 s54, v20, 12
+; VI-NEXT:    v_readlane_b32 s53, v20, 11
+; VI-NEXT:    v_readlane_b32 s52, v20, 10
+; VI-NEXT:    v_readlane_b32 s51, v20, 9
+; VI-NEXT:    v_readlane_b32 s50, v20, 8
+; VI-NEXT:    v_readlane_b32 s49, v20, 7
+; VI-NEXT:    v_readlane_b32 s48, v20, 6
+; VI-NEXT:    v_readlane_b32 s39, v20, 5
+; VI-NEXT:    v_readlane_b32 s38, v20, 4
+; VI-NEXT:    v_readlane_b32 s37, v20, 3
+; VI-NEXT:    v_readlane_b32 s36, v20, 2
+; VI-NEXT:    v_readlane_b32 s35, v20, 1
+; VI-NEXT:    v_readlane_b32 s34, v20, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -13903,7 +13903,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) {
+define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14461,7 +14461,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v20f32_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15345,7 +15345,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) {
+define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v20f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16278,7 +16278,7 @@ end:
   ret <20 x float> %phi
 }
 
-define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v20f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16620,30 +16620,32 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v5
 ; VI-NEXT:    v_readfirstlane_b32 s8, v4
 ; VI-NEXT:    v_readfirstlane_b32 s11, v3
 ; VI-NEXT:    v_readfirstlane_b32 s14, v2
 ; VI-NEXT:    v_readfirstlane_b32 s73, v1
 ; VI-NEXT:    v_readfirstlane_b32 s76, v0
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
+; VI-NEXT:    v_writelane_b32 v32, s31, 19
 ; VI-NEXT:    s_lshr_b32 s10, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -16665,9 +16667,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; VI-NEXT:    s_lshr_b32 s75, s73, 16
 ; VI-NEXT:    s_lshr_b32 s78, s76, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB35_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -16851,26 +16851,26 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 18
+; VI-NEXT:    v_readlane_b32 s31, v32, 19
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17134,7 +17134,7 @@ end:
   ret <20 x float> %phi
 }
 
-define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) {
+define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17289,7 +17289,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v10i64_to_v10f64_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v10i64_to_v10f64_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17534,7 +17534,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) {
+define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17644,7 +17644,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18084,7 +18084,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) {
+define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18672,7 +18672,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19318,7 +19318,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
+define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20166,7 +20166,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20434,30 +20434,32 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s34, 0
+; VI-NEXT:    v_writelane_b32 v20, s35, 1
+; VI-NEXT:    v_writelane_b32 v20, s36, 2
+; VI-NEXT:    v_writelane_b32 v20, s37, 3
+; VI-NEXT:    v_writelane_b32 v20, s38, 4
+; VI-NEXT:    v_writelane_b32 v20, s39, 5
+; VI-NEXT:    v_writelane_b32 v20, s48, 6
+; VI-NEXT:    v_writelane_b32 v20, s49, 7
+; VI-NEXT:    v_writelane_b32 v20, s50, 8
+; VI-NEXT:    v_writelane_b32 v20, s51, 9
+; VI-NEXT:    v_writelane_b32 v20, s52, 10
+; VI-NEXT:    v_writelane_b32 v20, s53, 11
+; VI-NEXT:    v_writelane_b32 v20, s54, 12
+; VI-NEXT:    v_writelane_b32 v20, s55, 13
+; VI-NEXT:    v_writelane_b32 v20, s64, 14
+; VI-NEXT:    v_writelane_b32 v20, s65, 15
+; VI-NEXT:    v_writelane_b32 v20, s66, 16
+; VI-NEXT:    v_writelane_b32 v20, s67, 17
+; VI-NEXT:    v_writelane_b32 v20, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s7, v5
 ; VI-NEXT:    v_readfirstlane_b32 s9, v4
 ; VI-NEXT:    v_readfirstlane_b32 s12, v3
 ; VI-NEXT:    v_readfirstlane_b32 s15, v2
 ; VI-NEXT:    v_readfirstlane_b32 s74, v1
 ; VI-NEXT:    v_readfirstlane_b32 s77, v0
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s31, 19
 ; VI-NEXT:    s_lshr_b32 s11, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -20479,9 +20481,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_lshr_b32 s73, s74, 16
 ; VI-NEXT:    s_lshr_b32 s76, s77, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB43_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -20647,6 +20647,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s55, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v20, 18
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -20667,26 +20668,25 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v17, s53
 ; VI-NEXT:    v_mov_b32_e32 v18, s54
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
-; VI-NEXT:    v_readlane_b32 s67, v20, 19
-; VI-NEXT:    v_readlane_b32 s66, v20, 18
-; VI-NEXT:    v_readlane_b32 s65, v20, 17
-; VI-NEXT:    v_readlane_b32 s64, v20, 16
-; VI-NEXT:    v_readlane_b32 s55, v20, 15
-; VI-NEXT:    v_readlane_b32 s54, v20, 14
-; VI-NEXT:    v_readlane_b32 s53, v20, 13
-; VI-NEXT:    v_readlane_b32 s52, v20, 12
-; VI-NEXT:    v_readlane_b32 s51, v20, 11
-; VI-NEXT:    v_readlane_b32 s50, v20, 10
-; VI-NEXT:    v_readlane_b32 s49, v20, 9
-; VI-NEXT:    v_readlane_b32 s48, v20, 8
-; VI-NEXT:    v_readlane_b32 s39, v20, 7
-; VI-NEXT:    v_readlane_b32 s38, v20, 6
-; VI-NEXT:    v_readlane_b32 s37, v20, 5
-; VI-NEXT:    v_readlane_b32 s36, v20, 4
-; VI-NEXT:    v_readlane_b32 s35, v20, 3
-; VI-NEXT:    v_readlane_b32 s34, v20, 2
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
-; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 19
+; VI-NEXT:    v_readlane_b32 s67, v20, 17
+; VI-NEXT:    v_readlane_b32 s66, v20, 16
+; VI-NEXT:    v_readlane_b32 s65, v20, 15
+; VI-NEXT:    v_readlane_b32 s64, v20, 14
+; VI-NEXT:    v_readlane_b32 s55, v20, 13
+; VI-NEXT:    v_readlane_b32 s54, v20, 12
+; VI-NEXT:    v_readlane_b32 s53, v20, 11
+; VI-NEXT:    v_readlane_b32 s52, v20, 10
+; VI-NEXT:    v_readlane_b32 s51, v20, 9
+; VI-NEXT:    v_readlane_b32 s50, v20, 8
+; VI-NEXT:    v_readlane_b32 s49, v20, 7
+; VI-NEXT:    v_readlane_b32 s48, v20, 6
+; VI-NEXT:    v_readlane_b32 s39, v20, 5
+; VI-NEXT:    v_readlane_b32 s38, v20, 4
+; VI-NEXT:    v_readlane_b32 s37, v20, 3
+; VI-NEXT:    v_readlane_b32 s36, v20, 2
+; VI-NEXT:    v_readlane_b32 s35, v20, 1
+; VI-NEXT:    v_readlane_b32 s34, v20, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20952,7 +20952,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) {
+define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21540,7 +21540,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10i64_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22186,7 +22186,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) {
+define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v10i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23119,7 +23119,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v10i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23461,30 +23461,32 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v5
 ; VI-NEXT:    v_readfirstlane_b32 s8, v4
 ; VI-NEXT:    v_readfirstlane_b32 s11, v3
 ; VI-NEXT:    v_readfirstlane_b32 s14, v2
 ; VI-NEXT:    v_readfirstlane_b32 s73, v1
 ; VI-NEXT:    v_readfirstlane_b32 s76, v0
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
+; VI-NEXT:    v_writelane_b32 v32, s31, 19
 ; VI-NEXT:    s_lshr_b32 s10, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -23506,9 +23508,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    s_lshr_b32 s75, s73, 16
 ; VI-NEXT:    s_lshr_b32 s78, s76, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB47_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -23692,26 +23692,26 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 18
+; VI-NEXT:    v_readlane_b32 s31, v32, 19
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23975,7 +23975,7 @@ end:
   ret <10 x i64> %phi
 }
 
-define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) {
+define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24503,7 +24503,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25337,7 +25337,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
+define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26185,7 +26185,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26453,30 +26453,32 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s34, 0
+; VI-NEXT:    v_writelane_b32 v20, s35, 1
+; VI-NEXT:    v_writelane_b32 v20, s36, 2
+; VI-NEXT:    v_writelane_b32 v20, s37, 3
+; VI-NEXT:    v_writelane_b32 v20, s38, 4
+; VI-NEXT:    v_writelane_b32 v20, s39, 5
+; VI-NEXT:    v_writelane_b32 v20, s48, 6
+; VI-NEXT:    v_writelane_b32 v20, s49, 7
+; VI-NEXT:    v_writelane_b32 v20, s50, 8
+; VI-NEXT:    v_writelane_b32 v20, s51, 9
+; VI-NEXT:    v_writelane_b32 v20, s52, 10
+; VI-NEXT:    v_writelane_b32 v20, s53, 11
+; VI-NEXT:    v_writelane_b32 v20, s54, 12
+; VI-NEXT:    v_writelane_b32 v20, s55, 13
+; VI-NEXT:    v_writelane_b32 v20, s64, 14
+; VI-NEXT:    v_writelane_b32 v20, s65, 15
+; VI-NEXT:    v_writelane_b32 v20, s66, 16
+; VI-NEXT:    v_writelane_b32 v20, s67, 17
+; VI-NEXT:    v_writelane_b32 v20, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s7, v5
 ; VI-NEXT:    v_readfirstlane_b32 s9, v4
 ; VI-NEXT:    v_readfirstlane_b32 s12, v3
 ; VI-NEXT:    v_readfirstlane_b32 s15, v2
 ; VI-NEXT:    v_readfirstlane_b32 s74, v1
 ; VI-NEXT:    v_readfirstlane_b32 s77, v0
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s31, 19
 ; VI-NEXT:    s_lshr_b32 s11, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -26498,9 +26500,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_lshr_b32 s73, s74, 16
 ; VI-NEXT:    s_lshr_b32 s76, s77, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -26666,6 +26666,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s55, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v20, 18
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -26686,26 +26687,25 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v17, s53
 ; VI-NEXT:    v_mov_b32_e32 v18, s54
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
-; VI-NEXT:    v_readlane_b32 s67, v20, 19
-; VI-NEXT:    v_readlane_b32 s66, v20, 18
-; VI-NEXT:    v_readlane_b32 s65, v20, 17
-; VI-NEXT:    v_readlane_b32 s64, v20, 16
-; VI-NEXT:    v_readlane_b32 s55, v20, 15
-; VI-NEXT:    v_readlane_b32 s54, v20, 14
-; VI-NEXT:    v_readlane_b32 s53, v20, 13
-; VI-NEXT:    v_readlane_b32 s52, v20, 12
-; VI-NEXT:    v_readlane_b32 s51, v20, 11
-; VI-NEXT:    v_readlane_b32 s50, v20, 10
-; VI-NEXT:    v_readlane_b32 s49, v20, 9
-; VI-NEXT:    v_readlane_b32 s48, v20, 8
-; VI-NEXT:    v_readlane_b32 s39, v20, 7
-; VI-NEXT:    v_readlane_b32 s38, v20, 6
-; VI-NEXT:    v_readlane_b32 s37, v20, 5
-; VI-NEXT:    v_readlane_b32 s36, v20, 4
-; VI-NEXT:    v_readlane_b32 s35, v20, 3
-; VI-NEXT:    v_readlane_b32 s34, v20, 2
-; VI-NEXT:    v_readlane_b32 s31, v20, 1
-; VI-NEXT:    v_readlane_b32 s30, v20, 0
+; VI-NEXT:    v_readlane_b32 s31, v20, 19
+; VI-NEXT:    v_readlane_b32 s67, v20, 17
+; VI-NEXT:    v_readlane_b32 s66, v20, 16
+; VI-NEXT:    v_readlane_b32 s65, v20, 15
+; VI-NEXT:    v_readlane_b32 s64, v20, 14
+; VI-NEXT:    v_readlane_b32 s55, v20, 13
+; VI-NEXT:    v_readlane_b32 s54, v20, 12
+; VI-NEXT:    v_readlane_b32 s53, v20, 11
+; VI-NEXT:    v_readlane_b32 s52, v20, 10
+; VI-NEXT:    v_readlane_b32 s51, v20, 9
+; VI-NEXT:    v_readlane_b32 s50, v20, 8
+; VI-NEXT:    v_readlane_b32 s49, v20, 7
+; VI-NEXT:    v_readlane_b32 s48, v20, 6
+; VI-NEXT:    v_readlane_b32 s39, v20, 5
+; VI-NEXT:    v_readlane_b32 s38, v20, 4
+; VI-NEXT:    v_readlane_b32 s37, v20, 3
+; VI-NEXT:    v_readlane_b32 s36, v20, 2
+; VI-NEXT:    v_readlane_b32 s35, v20, 1
+; VI-NEXT:    v_readlane_b32 s34, v20, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26971,7 +26971,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) {
+define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27499,7 +27499,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v10f64_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28333,7 +28333,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) {
+define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v10f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29266,7 +29266,7 @@ end:
   ret <10 x double> %phi
 }
 
-define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v10f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29608,30 +29608,32 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s30, 18
 ; VI-NEXT:    v_readfirstlane_b32 s6, v5
 ; VI-NEXT:    v_readfirstlane_b32 s8, v4
 ; VI-NEXT:    v_readfirstlane_b32 s11, v3
 ; VI-NEXT:    v_readfirstlane_b32 s14, v2
 ; VI-NEXT:    v_readfirstlane_b32 s73, v1
 ; VI-NEXT:    v_readfirstlane_b32 s76, v0
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
+; VI-NEXT:    v_writelane_b32 v32, s31, 19
 ; VI-NEXT:    s_lshr_b32 s10, s29, 16
 ; VI-NEXT:    s_lshr_b32 s13, s28, 16
 ; VI-NEXT:    s_lshr_b32 s72, s27, 16
@@ -29653,9 +29655,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; VI-NEXT:    s_lshr_b32 s75, s73, 16
 ; VI-NEXT:    s_lshr_b32 s78, s76, 16
 ; VI-NEXT:    v_readfirstlane_b32 s4, v6
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
 ; VI-NEXT:    s_cbranch_scc0 .LBB55_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_and_b32 s4, 0xffff, s16
@@ -29839,26 +29839,26 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 18
+; VI-NEXT:    v_readlane_b32 s31, v32, 19
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30122,7 +30122,7 @@ end:
   ret <10 x double> %phi
 }
 
-define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
+define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v40f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30990,7 +30990,7 @@ end:
   ret <40 x half> %phi
 }
 
-define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40i16_to_v40f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30998,36 +30998,36 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v20, s30, 0
-; SI-NEXT:    v_writelane_b32 v20, s31, 1
-; SI-NEXT:    v_writelane_b32 v20, s34, 2
-; SI-NEXT:    v_writelane_b32 v20, s35, 3
-; SI-NEXT:    v_writelane_b32 v20, s36, 4
-; SI-NEXT:    v_writelane_b32 v20, s37, 5
-; SI-NEXT:    v_writelane_b32 v20, s38, 6
-; SI-NEXT:    v_writelane_b32 v20, s39, 7
-; SI-NEXT:    v_writelane_b32 v20, s48, 8
-; SI-NEXT:    v_writelane_b32 v20, s49, 9
-; SI-NEXT:    v_writelane_b32 v20, s50, 10
-; SI-NEXT:    v_writelane_b32 v20, s51, 11
-; SI-NEXT:    v_writelane_b32 v20, s52, 12
-; SI-NEXT:    v_writelane_b32 v20, s53, 13
-; SI-NEXT:    v_writelane_b32 v20, s54, 14
-; SI-NEXT:    v_writelane_b32 v20, s55, 15
-; SI-NEXT:    v_writelane_b32 v20, s64, 16
-; SI-NEXT:    v_writelane_b32 v20, s65, 17
-; SI-NEXT:    v_writelane_b32 v20, s66, 18
-; SI-NEXT:    v_writelane_b32 v20, s67, 19
-; SI-NEXT:    v_writelane_b32 v20, s68, 20
-; SI-NEXT:    v_writelane_b32 v20, s69, 21
-; SI-NEXT:    v_writelane_b32 v20, s70, 22
-; SI-NEXT:    v_writelane_b32 v20, s71, 23
+; SI-NEXT:    v_writelane_b32 v20, s34, 0
+; SI-NEXT:    v_writelane_b32 v20, s35, 1
+; SI-NEXT:    v_writelane_b32 v20, s36, 2
+; SI-NEXT:    v_writelane_b32 v20, s37, 3
+; SI-NEXT:    v_writelane_b32 v20, s38, 4
+; SI-NEXT:    v_writelane_b32 v20, s39, 5
+; SI-NEXT:    v_writelane_b32 v20, s48, 6
+; SI-NEXT:    v_writelane_b32 v20, s49, 7
+; SI-NEXT:    v_writelane_b32 v20, s50, 8
+; SI-NEXT:    v_writelane_b32 v20, s51, 9
+; SI-NEXT:    v_writelane_b32 v20, s52, 10
+; SI-NEXT:    v_writelane_b32 v20, s53, 11
+; SI-NEXT:    v_writelane_b32 v20, s54, 12
+; SI-NEXT:    v_writelane_b32 v20, s55, 13
+; SI-NEXT:    v_writelane_b32 v20, s64, 14
+; SI-NEXT:    v_writelane_b32 v20, s65, 15
+; SI-NEXT:    v_writelane_b32 v20, s66, 16
+; SI-NEXT:    v_writelane_b32 v20, s67, 17
+; SI-NEXT:    v_writelane_b32 v20, s68, 18
+; SI-NEXT:    v_writelane_b32 v20, s69, 19
+; SI-NEXT:    v_writelane_b32 v20, s70, 20
+; SI-NEXT:    v_writelane_b32 v20, s71, 21
+; SI-NEXT:    v_writelane_b32 v20, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s69, v5
 ; SI-NEXT:    v_readfirstlane_b32 s71, v4
 ; SI-NEXT:    v_readfirstlane_b32 s66, v3
 ; SI-NEXT:    v_readfirstlane_b32 s68, v2
 ; SI-NEXT:    v_readfirstlane_b32 s55, v1
 ; SI-NEXT:    v_readfirstlane_b32 s65, v0
+; SI-NEXT:    v_writelane_b32 v20, s31, 23
 ; SI-NEXT:    s_lshr_b32 s36, s29, 16
 ; SI-NEXT:    s_lshr_b32 s54, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -31315,6 +31315,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s24, s39, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s24
+; SI-NEXT:    v_readlane_b32 s30, v20, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -31335,30 +31336,29 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v17, s7
 ; SI-NEXT:    v_mov_b32_e32 v18, s4
 ; SI-NEXT:    v_mov_b32_e32 v19, s5
-; SI-NEXT:    v_readlane_b32 s71, v20, 23
-; SI-NEXT:    v_readlane_b32 s70, v20, 22
-; SI-NEXT:    v_readlane_b32 s69, v20, 21
-; SI-NEXT:    v_readlane_b32 s68, v20, 20
-; SI-NEXT:    v_readlane_b32 s67, v20, 19
-; SI-NEXT:    v_readlane_b32 s66, v20, 18
-; SI-NEXT:    v_readlane_b32 s65, v20, 17
-; SI-NEXT:    v_readlane_b32 s64, v20, 16
-; SI-NEXT:    v_readlane_b32 s55, v20, 15
-; SI-NEXT:    v_readlane_b32 s54, v20, 14
-; SI-NEXT:    v_readlane_b32 s53, v20, 13
-; SI-NEXT:    v_readlane_b32 s52, v20, 12
-; SI-NEXT:    v_readlane_b32 s51, v20, 11
-; SI-NEXT:    v_readlane_b32 s50, v20, 10
-; SI-NEXT:    v_readlane_b32 s49, v20, 9
-; SI-NEXT:    v_readlane_b32 s48, v20, 8
-; SI-NEXT:    v_readlane_b32 s39, v20, 7
-; SI-NEXT:    v_readlane_b32 s38, v20, 6
-; SI-NEXT:    v_readlane_b32 s37, v20, 5
-; SI-NEXT:    v_readlane_b32 s36, v20, 4
-; SI-NEXT:    v_readlane_b32 s35, v20, 3
-; SI-NEXT:    v_readlane_b32 s34, v20, 2
-; SI-NEXT:    v_readlane_b32 s31, v20, 1
-; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    v_readlane_b32 s31, v20, 23
+; SI-NEXT:    v_readlane_b32 s71, v20, 21
+; SI-NEXT:    v_readlane_b32 s70, v20, 20
+; SI-NEXT:    v_readlane_b32 s69, v20, 19
+; SI-NEXT:    v_readlane_b32 s68, v20, 18
+; SI-NEXT:    v_readlane_b32 s67, v20, 17
+; SI-NEXT:    v_readlane_b32 s66, v20, 16
+; SI-NEXT:    v_readlane_b32 s65, v20, 15
+; SI-NEXT:    v_readlane_b32 s64, v20, 14
+; SI-NEXT:    v_readlane_b32 s55, v20, 13
+; SI-NEXT:    v_readlane_b32 s54, v20, 12
+; SI-NEXT:    v_readlane_b32 s53, v20, 11
+; SI-NEXT:    v_readlane_b32 s52, v20, 10
+; SI-NEXT:    v_readlane_b32 s51, v20, 9
+; SI-NEXT:    v_readlane_b32 s50, v20, 8
+; SI-NEXT:    v_readlane_b32 s49, v20, 7
+; SI-NEXT:    v_readlane_b32 s48, v20, 6
+; SI-NEXT:    v_readlane_b32 s39, v20, 5
+; SI-NEXT:    v_readlane_b32 s38, v20, 4
+; SI-NEXT:    v_readlane_b32 s37, v20, 3
+; SI-NEXT:    v_readlane_b32 s36, v20, 2
+; SI-NEXT:    v_readlane_b32 s35, v20, 1
+; SI-NEXT:    v_readlane_b32 s34, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32047,7 +32047,7 @@ end:
   ret <40 x half> %phi
 }
 
-define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) {
+define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v40i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32704,7 +32704,7 @@ end:
   ret <40 x i16> %phi
 }
 
-define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33710,3 +33710,5 @@ end:
   %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <40 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 68bf611ec019e..77c047f492cac 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define double @bitcast_i64_to_f64(i64 %a, i32 %b) {
+define double @bitcast_i64_to_f64(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -81,7 +81,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -168,7 +168,7 @@ end:
   ret double %phi
 }
 
-define i64 @bitcast_f64_to_i64(double %a, i32 %b) {
+define i64 @bitcast_f64_to_i64(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -239,7 +239,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -325,7 +325,7 @@ end:
   ret i64 %phi
 }
 
-define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) {
+define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -400,7 +400,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -487,7 +487,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) {
+define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -561,7 +561,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -648,7 +648,7 @@ end:
   ret i64 %phi
 }
 
-define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) {
+define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -723,7 +723,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -810,7 +810,7 @@ end:
   ret <2 x float> %phi
 }
 
-define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) {
+define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -883,7 +883,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -973,7 +973,7 @@ end:
   ret i64 %phi
 }
 
-define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) {
+define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1062,7 +1062,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1161,7 +1161,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) {
+define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1269,7 +1269,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1382,7 +1382,7 @@ end:
   ret i64 %phi
 }
 
-define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) {
+define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1471,7 +1471,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1570,7 +1570,7 @@ end:
   ret <4 x half> %phi
 }
 
-define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) {
+define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1687,7 +1687,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1810,7 +1810,7 @@ end:
   ret i64 %phi
 }
 
-define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) {
+define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1909,7 +1909,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2014,7 +2014,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
+define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2279,7 +2279,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2562,7 +2562,7 @@ end:
   ret i64 %phi
 }
 
-define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) {
+define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2774,7 +2774,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_i64_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2954,7 +2954,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
+define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3287,7 +3287,7 @@ end:
   ret i64 %phi
 }
 
-define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3511,7 +3511,7 @@ end:
   ret i64 %phi
 }
 
-define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) {
+define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3582,7 +3582,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3668,7 +3668,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) {
+define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3742,7 +3742,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3829,7 +3829,7 @@ end:
   ret double %phi
 }
 
-define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) {
+define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3900,7 +3900,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3986,7 +3986,7 @@ end:
   ret <2 x float> %phi
 }
 
-define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) {
+define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4059,7 +4059,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4149,7 +4149,7 @@ end:
   ret double %phi
 }
 
-define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) {
+define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4234,7 +4234,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4335,7 +4335,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) {
+define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4443,7 +4443,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4556,7 +4556,7 @@ end:
   ret double %phi
 }
 
-define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) {
+define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4641,7 +4641,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4742,7 +4742,7 @@ end:
   ret <4 x half> %phi
 }
 
-define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) {
+define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4859,7 +4859,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4982,7 +4982,7 @@ end:
   ret double %phi
 }
 
-define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) {
+define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5076,7 +5076,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5185,7 +5185,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
+define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5450,7 +5450,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5733,7 +5733,7 @@ end:
   ret double %phi
 }
 
-define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) {
+define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5938,7 +5938,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_f64_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6132,7 +6132,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
+define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6465,7 +6465,7 @@ end:
   ret double %phi
 }
 
-define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6689,7 +6689,7 @@ end:
   ret double %phi
 }
 
-define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) {
+define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6763,7 +6763,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6850,7 +6850,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) {
+define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6923,7 +6923,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7013,7 +7013,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) {
+define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7101,7 +7101,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7200,7 +7200,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) {
+define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7308,7 +7308,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7421,7 +7421,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) {
+define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7509,7 +7509,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7608,7 +7608,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) {
+define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7725,7 +7725,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7848,7 +7848,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7946,7 +7946,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8051,7 +8051,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
+define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8316,7 +8316,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8599,7 +8599,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) {
+define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8809,7 +8809,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2i32_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8989,7 +8989,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
+define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9322,7 +9322,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v2i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9546,7 +9546,7 @@ end:
   ret <2 x i32> %phi
 }
 
-define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) {
+define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9633,7 +9633,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9738,7 +9738,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) {
+define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9846,7 +9846,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9959,7 +9959,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) {
+define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10046,7 +10046,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10151,7 +10151,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) {
+define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10268,7 +10268,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10391,7 +10391,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10488,7 +10488,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10601,7 +10601,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
+define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10866,7 +10866,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11149,7 +11149,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) {
+define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11357,7 +11357,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v2f32_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11555,7 +11555,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
+define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11888,7 +11888,7 @@ end:
   ret <2 x float> %phi
 }
 
-define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v2f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12112,7 +12112,7 @@ end:
   ret <2 x float> %phi
 }
 
-define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) {
+define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12224,7 +12224,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12348,7 +12348,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) {
+define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12452,7 +12452,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12578,7 +12578,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12688,7 +12688,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12812,7 +12812,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
+define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13077,7 +13077,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13358,7 +13358,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) {
+define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13595,7 +13595,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4i16_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13809,7 +13809,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
+define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14150,7 +14150,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14387,7 +14387,7 @@ end:
   ret <4 x i16> %phi
 }
 
-define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14506,7 +14506,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14641,7 +14641,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) {
+define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14917,7 +14917,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15213,7 +15213,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) {
+define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15454,7 +15454,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4f16_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15678,7 +15678,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
+define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16019,7 +16019,7 @@ end:
   ret <4 x half> %phi
 }
 
-define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16256,7 +16256,7 @@ end:
   ret <4 x half> %phi
 }
 
-define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
+define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16640,7 +16640,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v4bf16_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17038,7 +17038,7 @@ end:
   ret <8 x i8> %phi
 }
 
-define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17380,7 +17380,7 @@ end:
   ret <4 x bfloat> %phi
 }
 
-define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v8i8_to_v4bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17614,3 +17614,5 @@ end:
   %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <4 x bfloat> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index d2f29b856a013..f1c80ed5d2873 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) {
+define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -164,7 +164,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -433,7 +433,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) {
+define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -580,7 +580,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1076,7 +1076,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) {
+define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1234,7 +1234,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1503,7 +1503,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) {
+define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1667,7 +1667,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1936,7 +1936,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) {
+define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2094,7 +2094,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2363,7 +2363,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) {
+define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2477,7 +2477,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2929,7 +2929,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) {
+define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3555,7 +3555,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4264,7 +4264,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
+define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5212,7 +5212,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5220,23 +5220,25 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
 ; SI-NEXT:    v_readfirstlane_b32 s9, v6
 ; SI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -5245,7 +5247,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s75, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s90, v0
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
+; SI-NEXT:    v_writelane_b32 v22, s31, 19
 ; SI-NEXT:    s_lshr_b32 s14, s29, 16
 ; SI-NEXT:    s_lshr_b32 s73, s28, 16
 ; SI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -5269,9 +5271,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshr_b32 s77, s79, 16
 ; SI-NEXT:    s_lshr_b32 s89, s90, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB15_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -5453,6 +5453,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v22, 18
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -5475,26 +5476,25 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v19, s55
 ; SI-NEXT:    v_mov_b32_e32 v20, s56
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 19
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5510,29 +5510,29 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v22, s30, 0
-; VI-NEXT:    v_writelane_b32 v22, s31, 1
-; VI-NEXT:    v_writelane_b32 v22, s34, 2
-; VI-NEXT:    v_writelane_b32 v22, s35, 3
-; VI-NEXT:    v_writelane_b32 v22, s36, 4
-; VI-NEXT:    v_writelane_b32 v22, s37, 5
-; VI-NEXT:    v_writelane_b32 v22, s38, 6
-; VI-NEXT:    v_writelane_b32 v22, s39, 7
-; VI-NEXT:    v_writelane_b32 v22, s48, 8
-; VI-NEXT:    v_writelane_b32 v22, s49, 9
-; VI-NEXT:    v_writelane_b32 v22, s50, 10
-; VI-NEXT:    v_writelane_b32 v22, s51, 11
-; VI-NEXT:    v_writelane_b32 v22, s52, 12
-; VI-NEXT:    v_writelane_b32 v22, s53, 13
-; VI-NEXT:    v_writelane_b32 v22, s54, 14
-; VI-NEXT:    v_writelane_b32 v22, s55, 15
-; VI-NEXT:    v_writelane_b32 v22, s64, 16
-; VI-NEXT:    v_writelane_b32 v22, s65, 17
-; VI-NEXT:    v_writelane_b32 v22, s66, 18
-; VI-NEXT:    v_writelane_b32 v22, s67, 19
-; VI-NEXT:    v_writelane_b32 v22, s68, 20
-; VI-NEXT:    v_writelane_b32 v22, s69, 21
-; VI-NEXT:    v_writelane_b32 v22, s70, 22
+; VI-NEXT:    v_writelane_b32 v22, s34, 0
+; VI-NEXT:    v_writelane_b32 v22, s35, 1
+; VI-NEXT:    v_writelane_b32 v22, s36, 2
+; VI-NEXT:    v_writelane_b32 v22, s37, 3
+; VI-NEXT:    v_writelane_b32 v22, s38, 4
+; VI-NEXT:    v_writelane_b32 v22, s39, 5
+; VI-NEXT:    v_writelane_b32 v22, s48, 6
+; VI-NEXT:    v_writelane_b32 v22, s49, 7
+; VI-NEXT:    v_writelane_b32 v22, s50, 8
+; VI-NEXT:    v_writelane_b32 v22, s51, 9
+; VI-NEXT:    v_writelane_b32 v22, s52, 10
+; VI-NEXT:    v_writelane_b32 v22, s53, 11
+; VI-NEXT:    v_writelane_b32 v22, s54, 12
+; VI-NEXT:    v_writelane_b32 v22, s55, 13
+; VI-NEXT:    v_writelane_b32 v22, s64, 14
+; VI-NEXT:    v_writelane_b32 v22, s65, 15
+; VI-NEXT:    v_writelane_b32 v22, s66, 16
+; VI-NEXT:    v_writelane_b32 v22, s67, 17
+; VI-NEXT:    v_writelane_b32 v22, s68, 18
+; VI-NEXT:    v_writelane_b32 v22, s69, 19
+; VI-NEXT:    v_writelane_b32 v22, s70, 20
+; VI-NEXT:    v_writelane_b32 v22, s71, 21
+; VI-NEXT:    v_writelane_b32 v22, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s7, v7
 ; VI-NEXT:    v_readfirstlane_b32 s9, v6
 ; VI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -5541,7 +5541,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v22, s71, 23
+; VI-NEXT:    v_writelane_b32 v22, s31, 23
 ; VI-NEXT:    s_lshr_b32 s14, s29, 16
 ; VI-NEXT:    s_lshr_b32 s73, s28, 16
 ; VI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -5747,6 +5747,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v22, 22
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -5769,30 +5770,29 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
 ; VI-NEXT:    v_mov_b32_e32 v20, s56
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
-; VI-NEXT:    v_readlane_b32 s71, v22, 23
-; VI-NEXT:    v_readlane_b32 s70, v22, 22
-; VI-NEXT:    v_readlane_b32 s69, v22, 21
-; VI-NEXT:    v_readlane_b32 s68, v22, 20
-; VI-NEXT:    v_readlane_b32 s67, v22, 19
-; VI-NEXT:    v_readlane_b32 s66, v22, 18
-; VI-NEXT:    v_readlane_b32 s65, v22, 17
-; VI-NEXT:    v_readlane_b32 s64, v22, 16
-; VI-NEXT:    v_readlane_b32 s55, v22, 15
-; VI-NEXT:    v_readlane_b32 s54, v22, 14
-; VI-NEXT:    v_readlane_b32 s53, v22, 13
-; VI-NEXT:    v_readlane_b32 s52, v22, 12
-; VI-NEXT:    v_readlane_b32 s51, v22, 11
-; VI-NEXT:    v_readlane_b32 s50, v22, 10
-; VI-NEXT:    v_readlane_b32 s49, v22, 9
-; VI-NEXT:    v_readlane_b32 s48, v22, 8
-; VI-NEXT:    v_readlane_b32 s39, v22, 7
-; VI-NEXT:    v_readlane_b32 s38, v22, 6
-; VI-NEXT:    v_readlane_b32 s37, v22, 5
-; VI-NEXT:    v_readlane_b32 s36, v22, 4
-; VI-NEXT:    v_readlane_b32 s35, v22, 3
-; VI-NEXT:    v_readlane_b32 s34, v22, 2
-; VI-NEXT:    v_readlane_b32 s31, v22, 1
-; VI-NEXT:    v_readlane_b32 s30, v22, 0
+; VI-NEXT:    v_readlane_b32 s31, v22, 23
+; VI-NEXT:    v_readlane_b32 s71, v22, 21
+; VI-NEXT:    v_readlane_b32 s70, v22, 20
+; VI-NEXT:    v_readlane_b32 s69, v22, 19
+; VI-NEXT:    v_readlane_b32 s68, v22, 18
+; VI-NEXT:    v_readlane_b32 s67, v22, 17
+; VI-NEXT:    v_readlane_b32 s66, v22, 16
+; VI-NEXT:    v_readlane_b32 s65, v22, 15
+; VI-NEXT:    v_readlane_b32 s64, v22, 14
+; VI-NEXT:    v_readlane_b32 s55, v22, 13
+; VI-NEXT:    v_readlane_b32 s54, v22, 12
+; VI-NEXT:    v_readlane_b32 s53, v22, 11
+; VI-NEXT:    v_readlane_b32 s52, v22, 10
+; VI-NEXT:    v_readlane_b32 s51, v22, 9
+; VI-NEXT:    v_readlane_b32 s50, v22, 8
+; VI-NEXT:    v_readlane_b32 s49, v22, 7
+; VI-NEXT:    v_readlane_b32 s48, v22, 6
+; VI-NEXT:    v_readlane_b32 s39, v22, 5
+; VI-NEXT:    v_readlane_b32 s38, v22, 4
+; VI-NEXT:    v_readlane_b32 s37, v22, 3
+; VI-NEXT:    v_readlane_b32 s36, v22, 2
+; VI-NEXT:    v_readlane_b32 s35, v22, 1
+; VI-NEXT:    v_readlane_b32 s34, v22, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -6074,7 +6074,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) {
+define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6700,7 +6700,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22i32_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7409,7 +7409,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) {
+define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v22i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8456,7 +8456,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v22i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8464,23 +8464,25 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s6, v7
 ; SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; SI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -8489,7 +8491,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s72, v2
 ; SI-NEXT:    v_readfirstlane_b32 s74, v1
 ; SI-NEXT:    v_readfirstlane_b32 s77, v0
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s31, 19
 ; SI-NEXT:    s_lshr_b32 s75, s29, 16
 ; SI-NEXT:    s_lshr_b32 s78, s28, 16
 ; SI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -8513,9 +8515,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    s_lshr_b32 s76, s74, 16
 ; SI-NEXT:    s_lshr_b32 s79, s77, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB19_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -8800,26 +8800,26 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 18
+; SI-NEXT:    v_readlane_b32 s31, v32, 19
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8832,29 +8832,29 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s6, v7
 ; VI-NEXT:    v_readfirstlane_b32 s8, v6
 ; VI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -8863,7 +8863,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s78, v1
 ; VI-NEXT:    v_readfirstlane_b32 s89, v0
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
+; VI-NEXT:    v_writelane_b32 v32, s31, 23
 ; VI-NEXT:    s_lshr_b32 s13, s29, 16
 ; VI-NEXT:    s_lshr_b32 s72, s28, 16
 ; VI-NEXT:    s_lshr_b32 s74, s27, 16
@@ -9085,30 +9085,30 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 22
+; VI-NEXT:    v_readlane_b32 s31, v32, 23
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9388,7 +9388,7 @@ end:
   ret <22 x i32> %phi
 }
 
-define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) {
+define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9535,7 +9535,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10031,7 +10031,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) {
+define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10195,7 +10195,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10464,7 +10464,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) {
+define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10611,7 +10611,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11107,7 +11107,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) {
+define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11221,7 +11221,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11673,7 +11673,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) {
+define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12277,7 +12277,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13247,7 +13247,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
+define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14195,7 +14195,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14203,23 +14203,25 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
 ; SI-NEXT:    v_readfirstlane_b32 s9, v6
 ; SI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -14228,7 +14230,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s75, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s90, v0
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
+; SI-NEXT:    v_writelane_b32 v22, s31, 19
 ; SI-NEXT:    s_lshr_b32 s14, s29, 16
 ; SI-NEXT:    s_lshr_b32 s73, s28, 16
 ; SI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -14252,9 +14254,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s77, s79, 16
 ; SI-NEXT:    s_lshr_b32 s89, s90, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB31_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -14436,6 +14436,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v22, 18
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -14458,26 +14459,25 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v19, s55
 ; SI-NEXT:    v_mov_b32_e32 v20, s56
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 19
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14493,29 +14493,29 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v22, s30, 0
-; VI-NEXT:    v_writelane_b32 v22, s31, 1
-; VI-NEXT:    v_writelane_b32 v22, s34, 2
-; VI-NEXT:    v_writelane_b32 v22, s35, 3
-; VI-NEXT:    v_writelane_b32 v22, s36, 4
-; VI-NEXT:    v_writelane_b32 v22, s37, 5
-; VI-NEXT:    v_writelane_b32 v22, s38, 6
-; VI-NEXT:    v_writelane_b32 v22, s39, 7
-; VI-NEXT:    v_writelane_b32 v22, s48, 8
-; VI-NEXT:    v_writelane_b32 v22, s49, 9
-; VI-NEXT:    v_writelane_b32 v22, s50, 10
-; VI-NEXT:    v_writelane_b32 v22, s51, 11
-; VI-NEXT:    v_writelane_b32 v22, s52, 12
-; VI-NEXT:    v_writelane_b32 v22, s53, 13
-; VI-NEXT:    v_writelane_b32 v22, s54, 14
-; VI-NEXT:    v_writelane_b32 v22, s55, 15
-; VI-NEXT:    v_writelane_b32 v22, s64, 16
-; VI-NEXT:    v_writelane_b32 v22, s65, 17
-; VI-NEXT:    v_writelane_b32 v22, s66, 18
-; VI-NEXT:    v_writelane_b32 v22, s67, 19
-; VI-NEXT:    v_writelane_b32 v22, s68, 20
-; VI-NEXT:    v_writelane_b32 v22, s69, 21
-; VI-NEXT:    v_writelane_b32 v22, s70, 22
+; VI-NEXT:    v_writelane_b32 v22, s34, 0
+; VI-NEXT:    v_writelane_b32 v22, s35, 1
+; VI-NEXT:    v_writelane_b32 v22, s36, 2
+; VI-NEXT:    v_writelane_b32 v22, s37, 3
+; VI-NEXT:    v_writelane_b32 v22, s38, 4
+; VI-NEXT:    v_writelane_b32 v22, s39, 5
+; VI-NEXT:    v_writelane_b32 v22, s48, 6
+; VI-NEXT:    v_writelane_b32 v22, s49, 7
+; VI-NEXT:    v_writelane_b32 v22, s50, 8
+; VI-NEXT:    v_writelane_b32 v22, s51, 9
+; VI-NEXT:    v_writelane_b32 v22, s52, 10
+; VI-NEXT:    v_writelane_b32 v22, s53, 11
+; VI-NEXT:    v_writelane_b32 v22, s54, 12
+; VI-NEXT:    v_writelane_b32 v22, s55, 13
+; VI-NEXT:    v_writelane_b32 v22, s64, 14
+; VI-NEXT:    v_writelane_b32 v22, s65, 15
+; VI-NEXT:    v_writelane_b32 v22, s66, 16
+; VI-NEXT:    v_writelane_b32 v22, s67, 17
+; VI-NEXT:    v_writelane_b32 v22, s68, 18
+; VI-NEXT:    v_writelane_b32 v22, s69, 19
+; VI-NEXT:    v_writelane_b32 v22, s70, 20
+; VI-NEXT:    v_writelane_b32 v22, s71, 21
+; VI-NEXT:    v_writelane_b32 v22, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s7, v7
 ; VI-NEXT:    v_readfirstlane_b32 s9, v6
 ; VI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -14524,7 +14524,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v22, s71, 23
+; VI-NEXT:    v_writelane_b32 v22, s31, 23
 ; VI-NEXT:    s_lshr_b32 s14, s29, 16
 ; VI-NEXT:    s_lshr_b32 s73, s28, 16
 ; VI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -14730,6 +14730,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v22, 22
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -14752,30 +14753,29 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
 ; VI-NEXT:    v_mov_b32_e32 v20, s56
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
-; VI-NEXT:    v_readlane_b32 s71, v22, 23
-; VI-NEXT:    v_readlane_b32 s70, v22, 22
-; VI-NEXT:    v_readlane_b32 s69, v22, 21
-; VI-NEXT:    v_readlane_b32 s68, v22, 20
-; VI-NEXT:    v_readlane_b32 s67, v22, 19
-; VI-NEXT:    v_readlane_b32 s66, v22, 18
-; VI-NEXT:    v_readlane_b32 s65, v22, 17
-; VI-NEXT:    v_readlane_b32 s64, v22, 16
-; VI-NEXT:    v_readlane_b32 s55, v22, 15
-; VI-NEXT:    v_readlane_b32 s54, v22, 14
-; VI-NEXT:    v_readlane_b32 s53, v22, 13
-; VI-NEXT:    v_readlane_b32 s52, v22, 12
-; VI-NEXT:    v_readlane_b32 s51, v22, 11
-; VI-NEXT:    v_readlane_b32 s50, v22, 10
-; VI-NEXT:    v_readlane_b32 s49, v22, 9
-; VI-NEXT:    v_readlane_b32 s48, v22, 8
-; VI-NEXT:    v_readlane_b32 s39, v22, 7
-; VI-NEXT:    v_readlane_b32 s38, v22, 6
-; VI-NEXT:    v_readlane_b32 s37, v22, 5
-; VI-NEXT:    v_readlane_b32 s36, v22, 4
-; VI-NEXT:    v_readlane_b32 s35, v22, 3
-; VI-NEXT:    v_readlane_b32 s34, v22, 2
-; VI-NEXT:    v_readlane_b32 s31, v22, 1
-; VI-NEXT:    v_readlane_b32 s30, v22, 0
+; VI-NEXT:    v_readlane_b32 s31, v22, 23
+; VI-NEXT:    v_readlane_b32 s71, v22, 21
+; VI-NEXT:    v_readlane_b32 s70, v22, 20
+; VI-NEXT:    v_readlane_b32 s69, v22, 19
+; VI-NEXT:    v_readlane_b32 s68, v22, 18
+; VI-NEXT:    v_readlane_b32 s67, v22, 17
+; VI-NEXT:    v_readlane_b32 s66, v22, 16
+; VI-NEXT:    v_readlane_b32 s65, v22, 15
+; VI-NEXT:    v_readlane_b32 s64, v22, 14
+; VI-NEXT:    v_readlane_b32 s55, v22, 13
+; VI-NEXT:    v_readlane_b32 s54, v22, 12
+; VI-NEXT:    v_readlane_b32 s53, v22, 11
+; VI-NEXT:    v_readlane_b32 s52, v22, 10
+; VI-NEXT:    v_readlane_b32 s51, v22, 9
+; VI-NEXT:    v_readlane_b32 s50, v22, 8
+; VI-NEXT:    v_readlane_b32 s49, v22, 7
+; VI-NEXT:    v_readlane_b32 s48, v22, 6
+; VI-NEXT:    v_readlane_b32 s39, v22, 5
+; VI-NEXT:    v_readlane_b32 s38, v22, 4
+; VI-NEXT:    v_readlane_b32 s37, v22, 3
+; VI-NEXT:    v_readlane_b32 s36, v22, 2
+; VI-NEXT:    v_readlane_b32 s35, v22, 1
+; VI-NEXT:    v_readlane_b32 s34, v22, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15057,7 +15057,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) {
+define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15661,7 +15661,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v22f32_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16631,7 +16631,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) {
+define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v22f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17678,7 +17678,7 @@ end:
   ret <22 x float> %phi
 }
 
-define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v22f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17686,23 +17686,25 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s6, v7
 ; SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; SI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -17711,7 +17713,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s72, v2
 ; SI-NEXT:    v_readfirstlane_b32 s74, v1
 ; SI-NEXT:    v_readfirstlane_b32 s77, v0
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s31, 19
 ; SI-NEXT:    s_lshr_b32 s75, s29, 16
 ; SI-NEXT:    s_lshr_b32 s78, s28, 16
 ; SI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -17735,9 +17737,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s76, s74, 16
 ; SI-NEXT:    s_lshr_b32 s79, s77, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB35_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -18022,26 +18022,26 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 18
+; SI-NEXT:    v_readlane_b32 s31, v32, 19
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18054,29 +18054,29 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s6, v7
 ; VI-NEXT:    v_readfirstlane_b32 s8, v6
 ; VI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -18085,7 +18085,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s78, v1
 ; VI-NEXT:    v_readfirstlane_b32 s89, v0
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
+; VI-NEXT:    v_writelane_b32 v32, s31, 23
 ; VI-NEXT:    s_lshr_b32 s13, s29, 16
 ; VI-NEXT:    s_lshr_b32 s72, s28, 16
 ; VI-NEXT:    s_lshr_b32 s74, s27, 16
@@ -18307,30 +18307,30 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 22
+; VI-NEXT:    v_readlane_b32 s31, v32, 23
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18610,7 +18610,7 @@ end:
   ret <22 x float> %phi
 }
 
-define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) {
+define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18774,7 +18774,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19042,7 +19042,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) {
+define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19156,7 +19156,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19608,7 +19608,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) {
+define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20246,7 +20246,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20955,7 +20955,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
+define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21903,7 +21903,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21911,23 +21911,25 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
 ; SI-NEXT:    v_readfirstlane_b32 s9, v6
 ; SI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -21936,7 +21938,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s75, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s90, v0
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
+; SI-NEXT:    v_writelane_b32 v22, s31, 19
 ; SI-NEXT:    s_lshr_b32 s14, s29, 16
 ; SI-NEXT:    s_lshr_b32 s73, s28, 16
 ; SI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -21960,9 +21962,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshr_b32 s77, s79, 16
 ; SI-NEXT:    s_lshr_b32 s89, s90, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB43_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -22144,6 +22144,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v22, 18
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -22166,26 +22167,25 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v19, s55
 ; SI-NEXT:    v_mov_b32_e32 v20, s56
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 19
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22201,29 +22201,29 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v22, s30, 0
-; VI-NEXT:    v_writelane_b32 v22, s31, 1
-; VI-NEXT:    v_writelane_b32 v22, s34, 2
-; VI-NEXT:    v_writelane_b32 v22, s35, 3
-; VI-NEXT:    v_writelane_b32 v22, s36, 4
-; VI-NEXT:    v_writelane_b32 v22, s37, 5
-; VI-NEXT:    v_writelane_b32 v22, s38, 6
-; VI-NEXT:    v_writelane_b32 v22, s39, 7
-; VI-NEXT:    v_writelane_b32 v22, s48, 8
-; VI-NEXT:    v_writelane_b32 v22, s49, 9
-; VI-NEXT:    v_writelane_b32 v22, s50, 10
-; VI-NEXT:    v_writelane_b32 v22, s51, 11
-; VI-NEXT:    v_writelane_b32 v22, s52, 12
-; VI-NEXT:    v_writelane_b32 v22, s53, 13
-; VI-NEXT:    v_writelane_b32 v22, s54, 14
-; VI-NEXT:    v_writelane_b32 v22, s55, 15
-; VI-NEXT:    v_writelane_b32 v22, s64, 16
-; VI-NEXT:    v_writelane_b32 v22, s65, 17
-; VI-NEXT:    v_writelane_b32 v22, s66, 18
-; VI-NEXT:    v_writelane_b32 v22, s67, 19
-; VI-NEXT:    v_writelane_b32 v22, s68, 20
-; VI-NEXT:    v_writelane_b32 v22, s69, 21
-; VI-NEXT:    v_writelane_b32 v22, s70, 22
+; VI-NEXT:    v_writelane_b32 v22, s34, 0
+; VI-NEXT:    v_writelane_b32 v22, s35, 1
+; VI-NEXT:    v_writelane_b32 v22, s36, 2
+; VI-NEXT:    v_writelane_b32 v22, s37, 3
+; VI-NEXT:    v_writelane_b32 v22, s38, 4
+; VI-NEXT:    v_writelane_b32 v22, s39, 5
+; VI-NEXT:    v_writelane_b32 v22, s48, 6
+; VI-NEXT:    v_writelane_b32 v22, s49, 7
+; VI-NEXT:    v_writelane_b32 v22, s50, 8
+; VI-NEXT:    v_writelane_b32 v22, s51, 9
+; VI-NEXT:    v_writelane_b32 v22, s52, 10
+; VI-NEXT:    v_writelane_b32 v22, s53, 11
+; VI-NEXT:    v_writelane_b32 v22, s54, 12
+; VI-NEXT:    v_writelane_b32 v22, s55, 13
+; VI-NEXT:    v_writelane_b32 v22, s64, 14
+; VI-NEXT:    v_writelane_b32 v22, s65, 15
+; VI-NEXT:    v_writelane_b32 v22, s66, 16
+; VI-NEXT:    v_writelane_b32 v22, s67, 17
+; VI-NEXT:    v_writelane_b32 v22, s68, 18
+; VI-NEXT:    v_writelane_b32 v22, s69, 19
+; VI-NEXT:    v_writelane_b32 v22, s70, 20
+; VI-NEXT:    v_writelane_b32 v22, s71, 21
+; VI-NEXT:    v_writelane_b32 v22, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s7, v7
 ; VI-NEXT:    v_readfirstlane_b32 s9, v6
 ; VI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -22232,7 +22232,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v22, s71, 23
+; VI-NEXT:    v_writelane_b32 v22, s31, 23
 ; VI-NEXT:    s_lshr_b32 s14, s29, 16
 ; VI-NEXT:    s_lshr_b32 s73, s28, 16
 ; VI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -22438,6 +22438,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v22, 22
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -22460,30 +22461,29 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
 ; VI-NEXT:    v_mov_b32_e32 v20, s56
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
-; VI-NEXT:    v_readlane_b32 s71, v22, 23
-; VI-NEXT:    v_readlane_b32 s70, v22, 22
-; VI-NEXT:    v_readlane_b32 s69, v22, 21
-; VI-NEXT:    v_readlane_b32 s68, v22, 20
-; VI-NEXT:    v_readlane_b32 s67, v22, 19
-; VI-NEXT:    v_readlane_b32 s66, v22, 18
-; VI-NEXT:    v_readlane_b32 s65, v22, 17
-; VI-NEXT:    v_readlane_b32 s64, v22, 16
-; VI-NEXT:    v_readlane_b32 s55, v22, 15
-; VI-NEXT:    v_readlane_b32 s54, v22, 14
-; VI-NEXT:    v_readlane_b32 s53, v22, 13
-; VI-NEXT:    v_readlane_b32 s52, v22, 12
-; VI-NEXT:    v_readlane_b32 s51, v22, 11
-; VI-NEXT:    v_readlane_b32 s50, v22, 10
-; VI-NEXT:    v_readlane_b32 s49, v22, 9
-; VI-NEXT:    v_readlane_b32 s48, v22, 8
-; VI-NEXT:    v_readlane_b32 s39, v22, 7
-; VI-NEXT:    v_readlane_b32 s38, v22, 6
-; VI-NEXT:    v_readlane_b32 s37, v22, 5
-; VI-NEXT:    v_readlane_b32 s36, v22, 4
-; VI-NEXT:    v_readlane_b32 s35, v22, 3
-; VI-NEXT:    v_readlane_b32 s34, v22, 2
-; VI-NEXT:    v_readlane_b32 s31, v22, 1
-; VI-NEXT:    v_readlane_b32 s30, v22, 0
+; VI-NEXT:    v_readlane_b32 s31, v22, 23
+; VI-NEXT:    v_readlane_b32 s71, v22, 21
+; VI-NEXT:    v_readlane_b32 s70, v22, 20
+; VI-NEXT:    v_readlane_b32 s69, v22, 19
+; VI-NEXT:    v_readlane_b32 s68, v22, 18
+; VI-NEXT:    v_readlane_b32 s67, v22, 17
+; VI-NEXT:    v_readlane_b32 s66, v22, 16
+; VI-NEXT:    v_readlane_b32 s65, v22, 15
+; VI-NEXT:    v_readlane_b32 s64, v22, 14
+; VI-NEXT:    v_readlane_b32 s55, v22, 13
+; VI-NEXT:    v_readlane_b32 s54, v22, 12
+; VI-NEXT:    v_readlane_b32 s53, v22, 11
+; VI-NEXT:    v_readlane_b32 s52, v22, 10
+; VI-NEXT:    v_readlane_b32 s51, v22, 9
+; VI-NEXT:    v_readlane_b32 s50, v22, 8
+; VI-NEXT:    v_readlane_b32 s49, v22, 7
+; VI-NEXT:    v_readlane_b32 s48, v22, 6
+; VI-NEXT:    v_readlane_b32 s39, v22, 5
+; VI-NEXT:    v_readlane_b32 s38, v22, 4
+; VI-NEXT:    v_readlane_b32 s37, v22, 3
+; VI-NEXT:    v_readlane_b32 s36, v22, 2
+; VI-NEXT:    v_readlane_b32 s35, v22, 1
+; VI-NEXT:    v_readlane_b32 s34, v22, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22765,7 +22765,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) {
+define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23403,7 +23403,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11i64_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24112,7 +24112,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) {
+define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v11i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25159,7 +25159,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v11i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25167,23 +25167,25 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s6, v7
 ; SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; SI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -25192,7 +25194,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s72, v2
 ; SI-NEXT:    v_readfirstlane_b32 s74, v1
 ; SI-NEXT:    v_readfirstlane_b32 s77, v0
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s31, 19
 ; SI-NEXT:    s_lshr_b32 s75, s29, 16
 ; SI-NEXT:    s_lshr_b32 s78, s28, 16
 ; SI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -25216,9 +25218,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    s_lshr_b32 s76, s74, 16
 ; SI-NEXT:    s_lshr_b32 s79, s77, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB47_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -25503,26 +25503,26 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 18
+; SI-NEXT:    v_readlane_b32 s31, v32, 19
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25535,29 +25535,29 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s6, v7
 ; VI-NEXT:    v_readfirstlane_b32 s8, v6
 ; VI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -25566,7 +25566,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s78, v1
 ; VI-NEXT:    v_readfirstlane_b32 s89, v0
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
+; VI-NEXT:    v_writelane_b32 v32, s31, 23
 ; VI-NEXT:    s_lshr_b32 s13, s29, 16
 ; VI-NEXT:    s_lshr_b32 s72, s28, 16
 ; VI-NEXT:    s_lshr_b32 s74, s27, 16
@@ -25788,30 +25788,30 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 22
+; VI-NEXT:    v_readlane_b32 s31, v32, 23
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26091,7 +26091,7 @@ end:
   ret <11 x i64> %phi
 }
 
-define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) {
+define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26662,7 +26662,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27577,7 +27577,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
+define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28525,7 +28525,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28533,23 +28533,25 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
 ; SI-NEXT:    v_readfirstlane_b32 s9, v6
 ; SI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -28558,7 +28560,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s75, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s90, v0
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
+; SI-NEXT:    v_writelane_b32 v22, s31, 19
 ; SI-NEXT:    s_lshr_b32 s14, s29, 16
 ; SI-NEXT:    s_lshr_b32 s73, s28, 16
 ; SI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -28582,9 +28584,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s77, s79, 16
 ; SI-NEXT:    s_lshr_b32 s89, s90, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -28766,6 +28766,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v22, 18
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -28788,26 +28789,25 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v19, s55
 ; SI-NEXT:    v_mov_b32_e32 v20, s56
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 19
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28823,29 +28823,29 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v22, s30, 0
-; VI-NEXT:    v_writelane_b32 v22, s31, 1
-; VI-NEXT:    v_writelane_b32 v22, s34, 2
-; VI-NEXT:    v_writelane_b32 v22, s35, 3
-; VI-NEXT:    v_writelane_b32 v22, s36, 4
-; VI-NEXT:    v_writelane_b32 v22, s37, 5
-; VI-NEXT:    v_writelane_b32 v22, s38, 6
-; VI-NEXT:    v_writelane_b32 v22, s39, 7
-; VI-NEXT:    v_writelane_b32 v22, s48, 8
-; VI-NEXT:    v_writelane_b32 v22, s49, 9
-; VI-NEXT:    v_writelane_b32 v22, s50, 10
-; VI-NEXT:    v_writelane_b32 v22, s51, 11
-; VI-NEXT:    v_writelane_b32 v22, s52, 12
-; VI-NEXT:    v_writelane_b32 v22, s53, 13
-; VI-NEXT:    v_writelane_b32 v22, s54, 14
-; VI-NEXT:    v_writelane_b32 v22, s55, 15
-; VI-NEXT:    v_writelane_b32 v22, s64, 16
-; VI-NEXT:    v_writelane_b32 v22, s65, 17
-; VI-NEXT:    v_writelane_b32 v22, s66, 18
-; VI-NEXT:    v_writelane_b32 v22, s67, 19
-; VI-NEXT:    v_writelane_b32 v22, s68, 20
-; VI-NEXT:    v_writelane_b32 v22, s69, 21
-; VI-NEXT:    v_writelane_b32 v22, s70, 22
+; VI-NEXT:    v_writelane_b32 v22, s34, 0
+; VI-NEXT:    v_writelane_b32 v22, s35, 1
+; VI-NEXT:    v_writelane_b32 v22, s36, 2
+; VI-NEXT:    v_writelane_b32 v22, s37, 3
+; VI-NEXT:    v_writelane_b32 v22, s38, 4
+; VI-NEXT:    v_writelane_b32 v22, s39, 5
+; VI-NEXT:    v_writelane_b32 v22, s48, 6
+; VI-NEXT:    v_writelane_b32 v22, s49, 7
+; VI-NEXT:    v_writelane_b32 v22, s50, 8
+; VI-NEXT:    v_writelane_b32 v22, s51, 9
+; VI-NEXT:    v_writelane_b32 v22, s52, 10
+; VI-NEXT:    v_writelane_b32 v22, s53, 11
+; VI-NEXT:    v_writelane_b32 v22, s54, 12
+; VI-NEXT:    v_writelane_b32 v22, s55, 13
+; VI-NEXT:    v_writelane_b32 v22, s64, 14
+; VI-NEXT:    v_writelane_b32 v22, s65, 15
+; VI-NEXT:    v_writelane_b32 v22, s66, 16
+; VI-NEXT:    v_writelane_b32 v22, s67, 17
+; VI-NEXT:    v_writelane_b32 v22, s68, 18
+; VI-NEXT:    v_writelane_b32 v22, s69, 19
+; VI-NEXT:    v_writelane_b32 v22, s70, 20
+; VI-NEXT:    v_writelane_b32 v22, s71, 21
+; VI-NEXT:    v_writelane_b32 v22, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s7, v7
 ; VI-NEXT:    v_readfirstlane_b32 s9, v6
 ; VI-NEXT:    v_readfirstlane_b32 s11, v5
@@ -28854,7 +28854,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v22, s71, 23
+; VI-NEXT:    v_writelane_b32 v22, s31, 23
 ; VI-NEXT:    s_lshr_b32 s14, s29, 16
 ; VI-NEXT:    s_lshr_b32 s73, s28, 16
 ; VI-NEXT:    s_lshr_b32 s76, s27, 16
@@ -29060,6 +29060,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s57, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v22, 22
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -29082,30 +29083,29 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v19, s55
 ; VI-NEXT:    v_mov_b32_e32 v20, s56
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
-; VI-NEXT:    v_readlane_b32 s71, v22, 23
-; VI-NEXT:    v_readlane_b32 s70, v22, 22
-; VI-NEXT:    v_readlane_b32 s69, v22, 21
-; VI-NEXT:    v_readlane_b32 s68, v22, 20
-; VI-NEXT:    v_readlane_b32 s67, v22, 19
-; VI-NEXT:    v_readlane_b32 s66, v22, 18
-; VI-NEXT:    v_readlane_b32 s65, v22, 17
-; VI-NEXT:    v_readlane_b32 s64, v22, 16
-; VI-NEXT:    v_readlane_b32 s55, v22, 15
-; VI-NEXT:    v_readlane_b32 s54, v22, 14
-; VI-NEXT:    v_readlane_b32 s53, v22, 13
-; VI-NEXT:    v_readlane_b32 s52, v22, 12
-; VI-NEXT:    v_readlane_b32 s51, v22, 11
-; VI-NEXT:    v_readlane_b32 s50, v22, 10
-; VI-NEXT:    v_readlane_b32 s49, v22, 9
-; VI-NEXT:    v_readlane_b32 s48, v22, 8
-; VI-NEXT:    v_readlane_b32 s39, v22, 7
-; VI-NEXT:    v_readlane_b32 s38, v22, 6
-; VI-NEXT:    v_readlane_b32 s37, v22, 5
-; VI-NEXT:    v_readlane_b32 s36, v22, 4
-; VI-NEXT:    v_readlane_b32 s35, v22, 3
-; VI-NEXT:    v_readlane_b32 s34, v22, 2
-; VI-NEXT:    v_readlane_b32 s31, v22, 1
-; VI-NEXT:    v_readlane_b32 s30, v22, 0
+; VI-NEXT:    v_readlane_b32 s31, v22, 23
+; VI-NEXT:    v_readlane_b32 s71, v22, 21
+; VI-NEXT:    v_readlane_b32 s70, v22, 20
+; VI-NEXT:    v_readlane_b32 s69, v22, 19
+; VI-NEXT:    v_readlane_b32 s68, v22, 18
+; VI-NEXT:    v_readlane_b32 s67, v22, 17
+; VI-NEXT:    v_readlane_b32 s66, v22, 16
+; VI-NEXT:    v_readlane_b32 s65, v22, 15
+; VI-NEXT:    v_readlane_b32 s64, v22, 14
+; VI-NEXT:    v_readlane_b32 s55, v22, 13
+; VI-NEXT:    v_readlane_b32 s54, v22, 12
+; VI-NEXT:    v_readlane_b32 s53, v22, 11
+; VI-NEXT:    v_readlane_b32 s52, v22, 10
+; VI-NEXT:    v_readlane_b32 s51, v22, 9
+; VI-NEXT:    v_readlane_b32 s50, v22, 8
+; VI-NEXT:    v_readlane_b32 s49, v22, 7
+; VI-NEXT:    v_readlane_b32 s48, v22, 6
+; VI-NEXT:    v_readlane_b32 s39, v22, 5
+; VI-NEXT:    v_readlane_b32 s38, v22, 4
+; VI-NEXT:    v_readlane_b32 s37, v22, 3
+; VI-NEXT:    v_readlane_b32 s36, v22, 2
+; VI-NEXT:    v_readlane_b32 s35, v22, 1
+; VI-NEXT:    v_readlane_b32 s34, v22, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29387,7 +29387,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) {
+define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29958,7 +29958,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v11f64_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30873,7 +30873,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) {
+define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v11f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31920,7 +31920,7 @@ end:
   ret <11 x double> %phi
 }
 
-define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v11f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31928,23 +31928,25 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s30, 18
 ; SI-NEXT:    v_readfirstlane_b32 s6, v7
 ; SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; SI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -31953,7 +31955,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s72, v2
 ; SI-NEXT:    v_readfirstlane_b32 s74, v1
 ; SI-NEXT:    v_readfirstlane_b32 s77, v0
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
+; SI-NEXT:    v_writelane_b32 v32, s31, 19
 ; SI-NEXT:    s_lshr_b32 s75, s29, 16
 ; SI-NEXT:    s_lshr_b32 s78, s28, 16
 ; SI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -31977,9 +31979,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    s_lshr_b32 s76, s74, 16
 ; SI-NEXT:    s_lshr_b32 s79, s77, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
 ; SI-NEXT:    s_cbranch_scc0 .LBB55_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -32264,26 +32264,26 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 18
+; SI-NEXT:    v_readlane_b32 s31, v32, 19
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32296,29 +32296,29 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s30, 22
 ; VI-NEXT:    v_readfirstlane_b32 s6, v7
 ; VI-NEXT:    v_readfirstlane_b32 s8, v6
 ; VI-NEXT:    v_readfirstlane_b32 s10, v5
@@ -32327,7 +32327,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s75, v2
 ; VI-NEXT:    v_readfirstlane_b32 s78, v1
 ; VI-NEXT:    v_readfirstlane_b32 s89, v0
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
+; VI-NEXT:    v_writelane_b32 v32, s31, 23
 ; VI-NEXT:    s_lshr_b32 s13, s29, 16
 ; VI-NEXT:    s_lshr_b32 s72, s28, 16
 ; VI-NEXT:    s_lshr_b32 s74, s27, 16
@@ -32549,30 +32549,30 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 22
+; VI-NEXT:    v_readlane_b32 s31, v32, 23
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32852,7 +32852,7 @@ end:
   ret <11 x double> %phi
 }
 
-define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
+define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v44f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33834,7 +33834,7 @@ end:
   ret <44 x half> %phi
 }
 
-define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44i16_to_v44f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33842,36 +33842,35 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v22, s30, 0
-; SI-NEXT:    v_writelane_b32 v22, s31, 1
-; SI-NEXT:    v_writelane_b32 v22, s34, 2
-; SI-NEXT:    v_writelane_b32 v22, s35, 3
-; SI-NEXT:    v_writelane_b32 v22, s36, 4
-; SI-NEXT:    v_writelane_b32 v22, s37, 5
-; SI-NEXT:    v_writelane_b32 v22, s38, 6
-; SI-NEXT:    v_writelane_b32 v22, s39, 7
-; SI-NEXT:    v_writelane_b32 v22, s48, 8
-; SI-NEXT:    v_writelane_b32 v22, s49, 9
-; SI-NEXT:    v_writelane_b32 v22, s50, 10
-; SI-NEXT:    v_writelane_b32 v22, s51, 11
-; SI-NEXT:    v_writelane_b32 v22, s52, 12
-; SI-NEXT:    v_writelane_b32 v22, s53, 13
-; SI-NEXT:    v_writelane_b32 v22, s54, 14
-; SI-NEXT:    v_writelane_b32 v22, s55, 15
-; SI-NEXT:    v_writelane_b32 v22, s64, 16
-; SI-NEXT:    v_writelane_b32 v22, s65, 17
-; SI-NEXT:    v_writelane_b32 v22, s66, 18
-; SI-NEXT:    v_writelane_b32 v22, s67, 19
-; SI-NEXT:    v_writelane_b32 v22, s68, 20
-; SI-NEXT:    v_writelane_b32 v22, s69, 21
-; SI-NEXT:    v_writelane_b32 v22, s70, 22
-; SI-NEXT:    v_writelane_b32 v22, s71, 23
-; SI-NEXT:    v_writelane_b32 v22, s80, 24
-; SI-NEXT:    v_writelane_b32 v22, s81, 25
-; SI-NEXT:    v_writelane_b32 v22, s82, 26
-; SI-NEXT:    v_writelane_b32 v22, s83, 27
-; SI-NEXT:    v_writelane_b32 v22, s84, 28
-; SI-NEXT:    v_writelane_b32 v22, s85, 29
+; SI-NEXT:    v_writelane_b32 v22, s34, 0
+; SI-NEXT:    v_writelane_b32 v22, s35, 1
+; SI-NEXT:    v_writelane_b32 v22, s36, 2
+; SI-NEXT:    v_writelane_b32 v22, s37, 3
+; SI-NEXT:    v_writelane_b32 v22, s38, 4
+; SI-NEXT:    v_writelane_b32 v22, s39, 5
+; SI-NEXT:    v_writelane_b32 v22, s48, 6
+; SI-NEXT:    v_writelane_b32 v22, s49, 7
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    v_writelane_b32 v22, s52, 10
+; SI-NEXT:    v_writelane_b32 v22, s53, 11
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    v_writelane_b32 v22, s64, 14
+; SI-NEXT:    v_writelane_b32 v22, s65, 15
+; SI-NEXT:    v_writelane_b32 v22, s66, 16
+; SI-NEXT:    v_writelane_b32 v22, s67, 17
+; SI-NEXT:    v_writelane_b32 v22, s68, 18
+; SI-NEXT:    v_writelane_b32 v22, s69, 19
+; SI-NEXT:    v_writelane_b32 v22, s70, 20
+; SI-NEXT:    v_writelane_b32 v22, s71, 21
+; SI-NEXT:    v_writelane_b32 v22, s80, 22
+; SI-NEXT:    v_writelane_b32 v22, s81, 23
+; SI-NEXT:    v_writelane_b32 v22, s82, 24
+; SI-NEXT:    v_writelane_b32 v22, s83, 25
+; SI-NEXT:    v_writelane_b32 v22, s84, 26
+; SI-NEXT:    v_writelane_b32 v22, s85, 27
+; SI-NEXT:    v_writelane_b32 v22, s30, 28
 ; SI-NEXT:    v_readfirstlane_b32 s83, v7
 ; SI-NEXT:    v_readfirstlane_b32 s85, v6
 ; SI-NEXT:    v_readfirstlane_b32 s80, v5
@@ -33880,6 +33879,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s71, v2
 ; SI-NEXT:    v_readfirstlane_b32 s66, v1
 ; SI-NEXT:    v_readfirstlane_b32 s68, v0
+; SI-NEXT:    v_writelane_b32 v22, s31, 29
 ; SI-NEXT:    s_lshr_b32 s38, s29, 16
 ; SI-NEXT:    s_lshr_b32 s65, s28, 16
 ; SI-NEXT:    s_lshr_b32 s37, s27, 16
@@ -34195,6 +34195,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s15, s15, 0xffff
 ; SI-NEXT:    s_lshl_b32 s26, s50, 16
 ; SI-NEXT:    s_or_b32 s15, s15, s26
+; SI-NEXT:    v_readlane_b32 s30, v22, 28
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v1, s13
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
@@ -34217,36 +34218,35 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v19, s25
 ; SI-NEXT:    v_mov_b32_e32 v20, s14
 ; SI-NEXT:    v_mov_b32_e32 v21, s15
-; SI-NEXT:    v_readlane_b32 s85, v22, 29
-; SI-NEXT:    v_readlane_b32 s84, v22, 28
-; SI-NEXT:    v_readlane_b32 s83, v22, 27
-; SI-NEXT:    v_readlane_b32 s82, v22, 26
-; SI-NEXT:    v_readlane_b32 s81, v22, 25
-; SI-NEXT:    v_readlane_b32 s80, v22, 24
-; SI-NEXT:    v_readlane_b32 s71, v22, 23
-; SI-NEXT:    v_readlane_b32 s70, v22, 22
-; SI-NEXT:    v_readlane_b32 s69, v22, 21
-; SI-NEXT:    v_readlane_b32 s68, v22, 20
-; SI-NEXT:    v_readlane_b32 s67, v22, 19
-; SI-NEXT:    v_readlane_b32 s66, v22, 18
-; SI-NEXT:    v_readlane_b32 s65, v22, 17
-; SI-NEXT:    v_readlane_b32 s64, v22, 16
-; SI-NEXT:    v_readlane_b32 s55, v22, 15
-; SI-NEXT:    v_readlane_b32 s54, v22, 14
-; SI-NEXT:    v_readlane_b32 s53, v22, 13
-; SI-NEXT:    v_readlane_b32 s52, v22, 12
-; SI-NEXT:    v_readlane_b32 s51, v22, 11
-; SI-NEXT:    v_readlane_b32 s50, v22, 10
-; SI-NEXT:    v_readlane_b32 s49, v22, 9
-; SI-NEXT:    v_readlane_b32 s48, v22, 8
-; SI-NEXT:    v_readlane_b32 s39, v22, 7
-; SI-NEXT:    v_readlane_b32 s38, v22, 6
-; SI-NEXT:    v_readlane_b32 s37, v22, 5
-; SI-NEXT:    v_readlane_b32 s36, v22, 4
-; SI-NEXT:    v_readlane_b32 s35, v22, 3
-; SI-NEXT:    v_readlane_b32 s34, v22, 2
-; SI-NEXT:    v_readlane_b32 s31, v22, 1
-; SI-NEXT:    v_readlane_b32 s30, v22, 0
+; SI-NEXT:    v_readlane_b32 s31, v22, 29
+; SI-NEXT:    v_readlane_b32 s85, v22, 27
+; SI-NEXT:    v_readlane_b32 s84, v22, 26
+; SI-NEXT:    v_readlane_b32 s83, v22, 25
+; SI-NEXT:    v_readlane_b32 s82, v22, 24
+; SI-NEXT:    v_readlane_b32 s81, v22, 23
+; SI-NEXT:    v_readlane_b32 s80, v22, 22
+; SI-NEXT:    v_readlane_b32 s71, v22, 21
+; SI-NEXT:    v_readlane_b32 s70, v22, 20
+; SI-NEXT:    v_readlane_b32 s69, v22, 19
+; SI-NEXT:    v_readlane_b32 s68, v22, 18
+; SI-NEXT:    v_readlane_b32 s67, v22, 17
+; SI-NEXT:    v_readlane_b32 s66, v22, 16
+; SI-NEXT:    v_readlane_b32 s65, v22, 15
+; SI-NEXT:    v_readlane_b32 s64, v22, 14
+; SI-NEXT:    v_readlane_b32 s55, v22, 13
+; SI-NEXT:    v_readlane_b32 s54, v22, 12
+; SI-NEXT:    v_readlane_b32 s53, v22, 11
+; SI-NEXT:    v_readlane_b32 s52, v22, 10
+; SI-NEXT:    v_readlane_b32 s51, v22, 9
+; SI-NEXT:    v_readlane_b32 s50, v22, 8
+; SI-NEXT:    v_readlane_b32 s49, v22, 7
+; SI-NEXT:    v_readlane_b32 s48, v22, 6
+; SI-NEXT:    v_readlane_b32 s39, v22, 5
+; SI-NEXT:    v_readlane_b32 s38, v22, 4
+; SI-NEXT:    v_readlane_b32 s37, v22, 3
+; SI-NEXT:    v_readlane_b32 s36, v22, 2
+; SI-NEXT:    v_readlane_b32 s35, v22, 1
+; SI-NEXT:    v_readlane_b32 s34, v22, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35001,7 +35001,7 @@ end:
   ret <44 x half> %phi
 }
 
-define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) {
+define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v44i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35715,7 +35715,7 @@ end:
   ret <44 x i16> %phi
 }
 
-define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v44f16_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36832,3 +36832,5 @@ end:
   %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <44 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index ba58d8a9fba52..1194fa2305563 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) {
+define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -172,7 +172,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,7 +464,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) {
+define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -618,7 +618,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1130,7 +1130,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) {
+define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1296,7 +1296,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1588,7 +1588,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) {
+define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1760,7 +1760,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2052,7 +2052,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) {
+define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2218,7 +2218,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2510,7 +2510,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) {
+define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2628,7 +2628,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3092,7 +3092,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) {
+define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3766,7 +3766,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3774,10 +3774,10 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
+; SI-NEXT:    v_writelane_b32 v24, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -3789,7 +3789,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
+; SI-NEXT:    v_writelane_b32 v24, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s88, s5, 16
@@ -3939,6 +3939,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s14, s88, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s29
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v24, 2
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
@@ -3963,10 +3964,9 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v21, s7
 ; SI-NEXT:    v_mov_b32_e32 v22, s4
 ; SI-NEXT:    v_mov_b32_e32 v23, s5
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 3
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4554,7 +4554,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
+define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5601,7 +5601,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5609,29 +5609,29 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
 ; SI-NEXT:    v_readfirstlane_b32 s9, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -5642,7 +5642,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
+; SI-NEXT:    v_writelane_b32 v24, s31, 23
 ; SI-NEXT:    s_lshr_b32 s72, s29, 16
 ; SI-NEXT:    s_lshr_b32 s75, s28, 16
 ; SI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -5866,6 +5866,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v24, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -5890,30 +5891,29 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
 ; SI-NEXT:    v_mov_b32_e32 v22, s58
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 23
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5929,33 +5929,33 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v24, s30, 0
-; VI-NEXT:    v_writelane_b32 v24, s31, 1
-; VI-NEXT:    v_writelane_b32 v24, s34, 2
-; VI-NEXT:    v_writelane_b32 v24, s35, 3
-; VI-NEXT:    v_writelane_b32 v24, s36, 4
-; VI-NEXT:    v_writelane_b32 v24, s37, 5
-; VI-NEXT:    v_writelane_b32 v24, s38, 6
-; VI-NEXT:    v_writelane_b32 v24, s39, 7
-; VI-NEXT:    v_writelane_b32 v24, s48, 8
-; VI-NEXT:    v_writelane_b32 v24, s49, 9
-; VI-NEXT:    v_writelane_b32 v24, s50, 10
-; VI-NEXT:    v_writelane_b32 v24, s51, 11
-; VI-NEXT:    v_writelane_b32 v24, s52, 12
-; VI-NEXT:    v_writelane_b32 v24, s53, 13
-; VI-NEXT:    v_writelane_b32 v24, s54, 14
-; VI-NEXT:    v_writelane_b32 v24, s55, 15
-; VI-NEXT:    v_writelane_b32 v24, s64, 16
-; VI-NEXT:    v_writelane_b32 v24, s65, 17
-; VI-NEXT:    v_writelane_b32 v24, s66, 18
-; VI-NEXT:    v_writelane_b32 v24, s67, 19
-; VI-NEXT:    v_writelane_b32 v24, s68, 20
-; VI-NEXT:    v_writelane_b32 v24, s69, 21
-; VI-NEXT:    v_writelane_b32 v24, s70, 22
-; VI-NEXT:    v_writelane_b32 v24, s71, 23
-; VI-NEXT:    v_writelane_b32 v24, s80, 24
-; VI-NEXT:    v_writelane_b32 v24, s81, 25
-; VI-NEXT:    v_writelane_b32 v24, s82, 26
+; VI-NEXT:    v_writelane_b32 v24, s34, 0
+; VI-NEXT:    v_writelane_b32 v24, s35, 1
+; VI-NEXT:    v_writelane_b32 v24, s36, 2
+; VI-NEXT:    v_writelane_b32 v24, s37, 3
+; VI-NEXT:    v_writelane_b32 v24, s38, 4
+; VI-NEXT:    v_writelane_b32 v24, s39, 5
+; VI-NEXT:    v_writelane_b32 v24, s48, 6
+; VI-NEXT:    v_writelane_b32 v24, s49, 7
+; VI-NEXT:    v_writelane_b32 v24, s50, 8
+; VI-NEXT:    v_writelane_b32 v24, s51, 9
+; VI-NEXT:    v_writelane_b32 v24, s52, 10
+; VI-NEXT:    v_writelane_b32 v24, s53, 11
+; VI-NEXT:    v_writelane_b32 v24, s54, 12
+; VI-NEXT:    v_writelane_b32 v24, s55, 13
+; VI-NEXT:    v_writelane_b32 v24, s64, 14
+; VI-NEXT:    v_writelane_b32 v24, s65, 15
+; VI-NEXT:    v_writelane_b32 v24, s66, 16
+; VI-NEXT:    v_writelane_b32 v24, s67, 17
+; VI-NEXT:    v_writelane_b32 v24, s68, 18
+; VI-NEXT:    v_writelane_b32 v24, s69, 19
+; VI-NEXT:    v_writelane_b32 v24, s70, 20
+; VI-NEXT:    v_writelane_b32 v24, s71, 21
+; VI-NEXT:    v_writelane_b32 v24, s80, 22
+; VI-NEXT:    v_writelane_b32 v24, s81, 23
+; VI-NEXT:    v_writelane_b32 v24, s82, 24
+; VI-NEXT:    v_writelane_b32 v24, s83, 25
+; VI-NEXT:    v_writelane_b32 v24, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s7, v9
 ; VI-NEXT:    v_readfirstlane_b32 s9, v8
 ; VI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -5966,7 +5966,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s88, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v24, s83, 27
+; VI-NEXT:    v_writelane_b32 v24, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -6190,6 +6190,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v24, 26
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -6214,34 +6215,33 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
 ; VI-NEXT:    v_mov_b32_e32 v22, s58
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
-; VI-NEXT:    v_readlane_b32 s83, v24, 27
-; VI-NEXT:    v_readlane_b32 s82, v24, 26
-; VI-NEXT:    v_readlane_b32 s81, v24, 25
-; VI-NEXT:    v_readlane_b32 s80, v24, 24
-; VI-NEXT:    v_readlane_b32 s71, v24, 23
-; VI-NEXT:    v_readlane_b32 s70, v24, 22
-; VI-NEXT:    v_readlane_b32 s69, v24, 21
-; VI-NEXT:    v_readlane_b32 s68, v24, 20
-; VI-NEXT:    v_readlane_b32 s67, v24, 19
-; VI-NEXT:    v_readlane_b32 s66, v24, 18
-; VI-NEXT:    v_readlane_b32 s65, v24, 17
-; VI-NEXT:    v_readlane_b32 s64, v24, 16
-; VI-NEXT:    v_readlane_b32 s55, v24, 15
-; VI-NEXT:    v_readlane_b32 s54, v24, 14
-; VI-NEXT:    v_readlane_b32 s53, v24, 13
-; VI-NEXT:    v_readlane_b32 s52, v24, 12
-; VI-NEXT:    v_readlane_b32 s51, v24, 11
-; VI-NEXT:    v_readlane_b32 s50, v24, 10
-; VI-NEXT:    v_readlane_b32 s49, v24, 9
-; VI-NEXT:    v_readlane_b32 s48, v24, 8
-; VI-NEXT:    v_readlane_b32 s39, v24, 7
-; VI-NEXT:    v_readlane_b32 s38, v24, 6
-; VI-NEXT:    v_readlane_b32 s37, v24, 5
-; VI-NEXT:    v_readlane_b32 s36, v24, 4
-; VI-NEXT:    v_readlane_b32 s35, v24, 3
-; VI-NEXT:    v_readlane_b32 s34, v24, 2
-; VI-NEXT:    v_readlane_b32 s31, v24, 1
-; VI-NEXT:    v_readlane_b32 s30, v24, 0
+; VI-NEXT:    v_readlane_b32 s31, v24, 27
+; VI-NEXT:    v_readlane_b32 s83, v24, 25
+; VI-NEXT:    v_readlane_b32 s82, v24, 24
+; VI-NEXT:    v_readlane_b32 s81, v24, 23
+; VI-NEXT:    v_readlane_b32 s80, v24, 22
+; VI-NEXT:    v_readlane_b32 s71, v24, 21
+; VI-NEXT:    v_readlane_b32 s70, v24, 20
+; VI-NEXT:    v_readlane_b32 s69, v24, 19
+; VI-NEXT:    v_readlane_b32 s68, v24, 18
+; VI-NEXT:    v_readlane_b32 s67, v24, 17
+; VI-NEXT:    v_readlane_b32 s66, v24, 16
+; VI-NEXT:    v_readlane_b32 s65, v24, 15
+; VI-NEXT:    v_readlane_b32 s64, v24, 14
+; VI-NEXT:    v_readlane_b32 s55, v24, 13
+; VI-NEXT:    v_readlane_b32 s54, v24, 12
+; VI-NEXT:    v_readlane_b32 s53, v24, 11
+; VI-NEXT:    v_readlane_b32 s52, v24, 10
+; VI-NEXT:    v_readlane_b32 s51, v24, 9
+; VI-NEXT:    v_readlane_b32 s50, v24, 8
+; VI-NEXT:    v_readlane_b32 s49, v24, 7
+; VI-NEXT:    v_readlane_b32 s48, v24, 6
+; VI-NEXT:    v_readlane_b32 s39, v24, 5
+; VI-NEXT:    v_readlane_b32 s38, v24, 4
+; VI-NEXT:    v_readlane_b32 s37, v24, 3
+; VI-NEXT:    v_readlane_b32 s36, v24, 2
+; VI-NEXT:    v_readlane_b32 s35, v24, 1
+; VI-NEXT:    v_readlane_b32 s34, v24, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -6539,7 +6539,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) {
+define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7213,7 +7213,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24i32_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7221,10 +7221,10 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
+; SI-NEXT:    v_writelane_b32 v24, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -7236,7 +7236,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
+; SI-NEXT:    v_writelane_b32 v24, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s88, s5, 16
@@ -7386,6 +7386,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s14, s88, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s29
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v24, 2
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
@@ -7410,10 +7411,9 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v21, s7
 ; SI-NEXT:    v_mov_b32_e32 v22, s4
 ; SI-NEXT:    v_mov_b32_e32 v23, s5
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 3
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8001,7 +8001,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
+define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v24i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9160,7 +9160,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v24i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9168,29 +9168,29 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s6, v9
 ; SI-NEXT:    v_readfirstlane_b32 s8, v8
 ; SI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -9201,7 +9201,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s76, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s89, v0
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s31, 23
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s90, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -9534,30 +9534,30 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 22
+; SI-NEXT:    v_readlane_b32 s31, v32, 23
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9570,33 +9570,33 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s6, v9
 ; VI-NEXT:    v_readfirstlane_b32 s8, v8
 ; VI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -9607,7 +9607,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s79, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s74, s28, 16
 ; VI-NEXT:    s_lshr_b32 s77, s27, 16
@@ -9845,34 +9845,34 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 26
+; VI-NEXT:    v_readlane_b32 s31, v32, 27
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10168,7 +10168,7 @@ end:
   ret <24 x i32> %phi
 }
 
-define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) {
+define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10322,7 +10322,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10834,7 +10834,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) {
+define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11006,7 +11006,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11298,7 +11298,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) {
+define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11452,7 +11452,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11964,7 +11964,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) {
+define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12082,7 +12082,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12546,7 +12546,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) {
+define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13196,7 +13196,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13204,10 +13204,10 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -13219,7 +13219,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s35, s5, 16
@@ -13433,6 +13433,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v36
+; SI-NEXT:    v_readlane_b32 s30, v40, 2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v35
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v34
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v33
@@ -13445,10 +13446,9 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v25
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 3
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14268,7 +14268,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
+define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15315,7 +15315,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15323,29 +15323,29 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
 ; SI-NEXT:    v_readfirstlane_b32 s9, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -15356,7 +15356,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
+; SI-NEXT:    v_writelane_b32 v24, s31, 23
 ; SI-NEXT:    s_lshr_b32 s72, s29, 16
 ; SI-NEXT:    s_lshr_b32 s75, s28, 16
 ; SI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -15580,6 +15580,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v24, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -15604,30 +15605,29 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
 ; SI-NEXT:    v_mov_b32_e32 v22, s58
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 23
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15643,33 +15643,33 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v24, s30, 0
-; VI-NEXT:    v_writelane_b32 v24, s31, 1
-; VI-NEXT:    v_writelane_b32 v24, s34, 2
-; VI-NEXT:    v_writelane_b32 v24, s35, 3
-; VI-NEXT:    v_writelane_b32 v24, s36, 4
-; VI-NEXT:    v_writelane_b32 v24, s37, 5
-; VI-NEXT:    v_writelane_b32 v24, s38, 6
-; VI-NEXT:    v_writelane_b32 v24, s39, 7
-; VI-NEXT:    v_writelane_b32 v24, s48, 8
-; VI-NEXT:    v_writelane_b32 v24, s49, 9
-; VI-NEXT:    v_writelane_b32 v24, s50, 10
-; VI-NEXT:    v_writelane_b32 v24, s51, 11
-; VI-NEXT:    v_writelane_b32 v24, s52, 12
-; VI-NEXT:    v_writelane_b32 v24, s53, 13
-; VI-NEXT:    v_writelane_b32 v24, s54, 14
-; VI-NEXT:    v_writelane_b32 v24, s55, 15
-; VI-NEXT:    v_writelane_b32 v24, s64, 16
-; VI-NEXT:    v_writelane_b32 v24, s65, 17
-; VI-NEXT:    v_writelane_b32 v24, s66, 18
-; VI-NEXT:    v_writelane_b32 v24, s67, 19
-; VI-NEXT:    v_writelane_b32 v24, s68, 20
-; VI-NEXT:    v_writelane_b32 v24, s69, 21
-; VI-NEXT:    v_writelane_b32 v24, s70, 22
-; VI-NEXT:    v_writelane_b32 v24, s71, 23
-; VI-NEXT:    v_writelane_b32 v24, s80, 24
-; VI-NEXT:    v_writelane_b32 v24, s81, 25
-; VI-NEXT:    v_writelane_b32 v24, s82, 26
+; VI-NEXT:    v_writelane_b32 v24, s34, 0
+; VI-NEXT:    v_writelane_b32 v24, s35, 1
+; VI-NEXT:    v_writelane_b32 v24, s36, 2
+; VI-NEXT:    v_writelane_b32 v24, s37, 3
+; VI-NEXT:    v_writelane_b32 v24, s38, 4
+; VI-NEXT:    v_writelane_b32 v24, s39, 5
+; VI-NEXT:    v_writelane_b32 v24, s48, 6
+; VI-NEXT:    v_writelane_b32 v24, s49, 7
+; VI-NEXT:    v_writelane_b32 v24, s50, 8
+; VI-NEXT:    v_writelane_b32 v24, s51, 9
+; VI-NEXT:    v_writelane_b32 v24, s52, 10
+; VI-NEXT:    v_writelane_b32 v24, s53, 11
+; VI-NEXT:    v_writelane_b32 v24, s54, 12
+; VI-NEXT:    v_writelane_b32 v24, s55, 13
+; VI-NEXT:    v_writelane_b32 v24, s64, 14
+; VI-NEXT:    v_writelane_b32 v24, s65, 15
+; VI-NEXT:    v_writelane_b32 v24, s66, 16
+; VI-NEXT:    v_writelane_b32 v24, s67, 17
+; VI-NEXT:    v_writelane_b32 v24, s68, 18
+; VI-NEXT:    v_writelane_b32 v24, s69, 19
+; VI-NEXT:    v_writelane_b32 v24, s70, 20
+; VI-NEXT:    v_writelane_b32 v24, s71, 21
+; VI-NEXT:    v_writelane_b32 v24, s80, 22
+; VI-NEXT:    v_writelane_b32 v24, s81, 23
+; VI-NEXT:    v_writelane_b32 v24, s82, 24
+; VI-NEXT:    v_writelane_b32 v24, s83, 25
+; VI-NEXT:    v_writelane_b32 v24, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s7, v9
 ; VI-NEXT:    v_readfirstlane_b32 s9, v8
 ; VI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -15680,7 +15680,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s88, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v24, s83, 27
+; VI-NEXT:    v_writelane_b32 v24, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -15904,6 +15904,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v24, 26
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -15928,34 +15929,33 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
 ; VI-NEXT:    v_mov_b32_e32 v22, s58
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
-; VI-NEXT:    v_readlane_b32 s83, v24, 27
-; VI-NEXT:    v_readlane_b32 s82, v24, 26
-; VI-NEXT:    v_readlane_b32 s81, v24, 25
-; VI-NEXT:    v_readlane_b32 s80, v24, 24
-; VI-NEXT:    v_readlane_b32 s71, v24, 23
-; VI-NEXT:    v_readlane_b32 s70, v24, 22
-; VI-NEXT:    v_readlane_b32 s69, v24, 21
-; VI-NEXT:    v_readlane_b32 s68, v24, 20
-; VI-NEXT:    v_readlane_b32 s67, v24, 19
-; VI-NEXT:    v_readlane_b32 s66, v24, 18
-; VI-NEXT:    v_readlane_b32 s65, v24, 17
-; VI-NEXT:    v_readlane_b32 s64, v24, 16
-; VI-NEXT:    v_readlane_b32 s55, v24, 15
-; VI-NEXT:    v_readlane_b32 s54, v24, 14
-; VI-NEXT:    v_readlane_b32 s53, v24, 13
-; VI-NEXT:    v_readlane_b32 s52, v24, 12
-; VI-NEXT:    v_readlane_b32 s51, v24, 11
-; VI-NEXT:    v_readlane_b32 s50, v24, 10
-; VI-NEXT:    v_readlane_b32 s49, v24, 9
-; VI-NEXT:    v_readlane_b32 s48, v24, 8
-; VI-NEXT:    v_readlane_b32 s39, v24, 7
-; VI-NEXT:    v_readlane_b32 s38, v24, 6
-; VI-NEXT:    v_readlane_b32 s37, v24, 5
-; VI-NEXT:    v_readlane_b32 s36, v24, 4
-; VI-NEXT:    v_readlane_b32 s35, v24, 3
-; VI-NEXT:    v_readlane_b32 s34, v24, 2
-; VI-NEXT:    v_readlane_b32 s31, v24, 1
-; VI-NEXT:    v_readlane_b32 s30, v24, 0
+; VI-NEXT:    v_readlane_b32 s31, v24, 27
+; VI-NEXT:    v_readlane_b32 s83, v24, 25
+; VI-NEXT:    v_readlane_b32 s82, v24, 24
+; VI-NEXT:    v_readlane_b32 s81, v24, 23
+; VI-NEXT:    v_readlane_b32 s80, v24, 22
+; VI-NEXT:    v_readlane_b32 s71, v24, 21
+; VI-NEXT:    v_readlane_b32 s70, v24, 20
+; VI-NEXT:    v_readlane_b32 s69, v24, 19
+; VI-NEXT:    v_readlane_b32 s68, v24, 18
+; VI-NEXT:    v_readlane_b32 s67, v24, 17
+; VI-NEXT:    v_readlane_b32 s66, v24, 16
+; VI-NEXT:    v_readlane_b32 s65, v24, 15
+; VI-NEXT:    v_readlane_b32 s64, v24, 14
+; VI-NEXT:    v_readlane_b32 s55, v24, 13
+; VI-NEXT:    v_readlane_b32 s54, v24, 12
+; VI-NEXT:    v_readlane_b32 s53, v24, 11
+; VI-NEXT:    v_readlane_b32 s52, v24, 10
+; VI-NEXT:    v_readlane_b32 s51, v24, 9
+; VI-NEXT:    v_readlane_b32 s50, v24, 8
+; VI-NEXT:    v_readlane_b32 s49, v24, 7
+; VI-NEXT:    v_readlane_b32 s48, v24, 6
+; VI-NEXT:    v_readlane_b32 s39, v24, 5
+; VI-NEXT:    v_readlane_b32 s38, v24, 4
+; VI-NEXT:    v_readlane_b32 s37, v24, 3
+; VI-NEXT:    v_readlane_b32 s36, v24, 2
+; VI-NEXT:    v_readlane_b32 s35, v24, 1
+; VI-NEXT:    v_readlane_b32 s34, v24, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16253,7 +16253,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) {
+define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16903,7 +16903,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v24f32_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16911,10 +16911,10 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -16926,7 +16926,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s35, s5, 16
@@ -17140,6 +17140,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v36
+; SI-NEXT:    v_readlane_b32 s30, v40, 2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v35
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v34
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v33
@@ -17152,10 +17153,9 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v25
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 3
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17975,7 +17975,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
+define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v24f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19134,7 +19134,7 @@ end:
   ret <24 x float> %phi
 }
 
-define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v24f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19142,29 +19142,29 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s6, v9
 ; SI-NEXT:    v_readfirstlane_b32 s8, v8
 ; SI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -19175,7 +19175,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s76, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s89, v0
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s31, 23
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s90, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -19508,30 +19508,30 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 22
+; SI-NEXT:    v_readlane_b32 s31, v32, 23
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19544,33 +19544,33 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s6, v9
 ; VI-NEXT:    v_readfirstlane_b32 s8, v8
 ; VI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -19581,7 +19581,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s79, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s74, s28, 16
 ; VI-NEXT:    s_lshr_b32 s77, s27, 16
@@ -19819,34 +19819,34 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 26
+; VI-NEXT:    v_readlane_b32 s31, v32, 27
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20142,7 +20142,7 @@ end:
   ret <24 x float> %phi
 }
 
-define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) {
+define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20314,7 +20314,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20605,7 +20605,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) {
+define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20723,7 +20723,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21187,7 +21187,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) {
+define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21873,7 +21873,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21881,10 +21881,10 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
+; SI-NEXT:    v_writelane_b32 v24, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -21896,7 +21896,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
+; SI-NEXT:    v_writelane_b32 v24, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s88, s5, 16
@@ -22046,6 +22046,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s14, s88, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s29
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v24, 2
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
@@ -22070,10 +22071,9 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v21, s7
 ; SI-NEXT:    v_mov_b32_e32 v22, s4
 ; SI-NEXT:    v_mov_b32_e32 v23, s5
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 3
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22661,7 +22661,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
+define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23708,7 +23708,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23716,29 +23716,29 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
 ; SI-NEXT:    v_readfirstlane_b32 s9, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -23749,7 +23749,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
+; SI-NEXT:    v_writelane_b32 v24, s31, 23
 ; SI-NEXT:    s_lshr_b32 s72, s29, 16
 ; SI-NEXT:    s_lshr_b32 s75, s28, 16
 ; SI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -23973,6 +23973,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v24, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -23997,30 +23998,29 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
 ; SI-NEXT:    v_mov_b32_e32 v22, s58
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 23
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24036,33 +24036,33 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v24, s30, 0
-; VI-NEXT:    v_writelane_b32 v24, s31, 1
-; VI-NEXT:    v_writelane_b32 v24, s34, 2
-; VI-NEXT:    v_writelane_b32 v24, s35, 3
-; VI-NEXT:    v_writelane_b32 v24, s36, 4
-; VI-NEXT:    v_writelane_b32 v24, s37, 5
-; VI-NEXT:    v_writelane_b32 v24, s38, 6
-; VI-NEXT:    v_writelane_b32 v24, s39, 7
-; VI-NEXT:    v_writelane_b32 v24, s48, 8
-; VI-NEXT:    v_writelane_b32 v24, s49, 9
-; VI-NEXT:    v_writelane_b32 v24, s50, 10
-; VI-NEXT:    v_writelane_b32 v24, s51, 11
-; VI-NEXT:    v_writelane_b32 v24, s52, 12
-; VI-NEXT:    v_writelane_b32 v24, s53, 13
-; VI-NEXT:    v_writelane_b32 v24, s54, 14
-; VI-NEXT:    v_writelane_b32 v24, s55, 15
-; VI-NEXT:    v_writelane_b32 v24, s64, 16
-; VI-NEXT:    v_writelane_b32 v24, s65, 17
-; VI-NEXT:    v_writelane_b32 v24, s66, 18
-; VI-NEXT:    v_writelane_b32 v24, s67, 19
-; VI-NEXT:    v_writelane_b32 v24, s68, 20
-; VI-NEXT:    v_writelane_b32 v24, s69, 21
-; VI-NEXT:    v_writelane_b32 v24, s70, 22
-; VI-NEXT:    v_writelane_b32 v24, s71, 23
-; VI-NEXT:    v_writelane_b32 v24, s80, 24
-; VI-NEXT:    v_writelane_b32 v24, s81, 25
-; VI-NEXT:    v_writelane_b32 v24, s82, 26
+; VI-NEXT:    v_writelane_b32 v24, s34, 0
+; VI-NEXT:    v_writelane_b32 v24, s35, 1
+; VI-NEXT:    v_writelane_b32 v24, s36, 2
+; VI-NEXT:    v_writelane_b32 v24, s37, 3
+; VI-NEXT:    v_writelane_b32 v24, s38, 4
+; VI-NEXT:    v_writelane_b32 v24, s39, 5
+; VI-NEXT:    v_writelane_b32 v24, s48, 6
+; VI-NEXT:    v_writelane_b32 v24, s49, 7
+; VI-NEXT:    v_writelane_b32 v24, s50, 8
+; VI-NEXT:    v_writelane_b32 v24, s51, 9
+; VI-NEXT:    v_writelane_b32 v24, s52, 10
+; VI-NEXT:    v_writelane_b32 v24, s53, 11
+; VI-NEXT:    v_writelane_b32 v24, s54, 12
+; VI-NEXT:    v_writelane_b32 v24, s55, 13
+; VI-NEXT:    v_writelane_b32 v24, s64, 14
+; VI-NEXT:    v_writelane_b32 v24, s65, 15
+; VI-NEXT:    v_writelane_b32 v24, s66, 16
+; VI-NEXT:    v_writelane_b32 v24, s67, 17
+; VI-NEXT:    v_writelane_b32 v24, s68, 18
+; VI-NEXT:    v_writelane_b32 v24, s69, 19
+; VI-NEXT:    v_writelane_b32 v24, s70, 20
+; VI-NEXT:    v_writelane_b32 v24, s71, 21
+; VI-NEXT:    v_writelane_b32 v24, s80, 22
+; VI-NEXT:    v_writelane_b32 v24, s81, 23
+; VI-NEXT:    v_writelane_b32 v24, s82, 24
+; VI-NEXT:    v_writelane_b32 v24, s83, 25
+; VI-NEXT:    v_writelane_b32 v24, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s7, v9
 ; VI-NEXT:    v_readfirstlane_b32 s9, v8
 ; VI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -24073,7 +24073,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s88, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v24, s83, 27
+; VI-NEXT:    v_writelane_b32 v24, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -24297,6 +24297,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v24, 26
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -24321,34 +24322,33 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
 ; VI-NEXT:    v_mov_b32_e32 v22, s58
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
-; VI-NEXT:    v_readlane_b32 s83, v24, 27
-; VI-NEXT:    v_readlane_b32 s82, v24, 26
-; VI-NEXT:    v_readlane_b32 s81, v24, 25
-; VI-NEXT:    v_readlane_b32 s80, v24, 24
-; VI-NEXT:    v_readlane_b32 s71, v24, 23
-; VI-NEXT:    v_readlane_b32 s70, v24, 22
-; VI-NEXT:    v_readlane_b32 s69, v24, 21
-; VI-NEXT:    v_readlane_b32 s68, v24, 20
-; VI-NEXT:    v_readlane_b32 s67, v24, 19
-; VI-NEXT:    v_readlane_b32 s66, v24, 18
-; VI-NEXT:    v_readlane_b32 s65, v24, 17
-; VI-NEXT:    v_readlane_b32 s64, v24, 16
-; VI-NEXT:    v_readlane_b32 s55, v24, 15
-; VI-NEXT:    v_readlane_b32 s54, v24, 14
-; VI-NEXT:    v_readlane_b32 s53, v24, 13
-; VI-NEXT:    v_readlane_b32 s52, v24, 12
-; VI-NEXT:    v_readlane_b32 s51, v24, 11
-; VI-NEXT:    v_readlane_b32 s50, v24, 10
-; VI-NEXT:    v_readlane_b32 s49, v24, 9
-; VI-NEXT:    v_readlane_b32 s48, v24, 8
-; VI-NEXT:    v_readlane_b32 s39, v24, 7
-; VI-NEXT:    v_readlane_b32 s38, v24, 6
-; VI-NEXT:    v_readlane_b32 s37, v24, 5
-; VI-NEXT:    v_readlane_b32 s36, v24, 4
-; VI-NEXT:    v_readlane_b32 s35, v24, 3
-; VI-NEXT:    v_readlane_b32 s34, v24, 2
-; VI-NEXT:    v_readlane_b32 s31, v24, 1
-; VI-NEXT:    v_readlane_b32 s30, v24, 0
+; VI-NEXT:    v_readlane_b32 s31, v24, 27
+; VI-NEXT:    v_readlane_b32 s83, v24, 25
+; VI-NEXT:    v_readlane_b32 s82, v24, 24
+; VI-NEXT:    v_readlane_b32 s81, v24, 23
+; VI-NEXT:    v_readlane_b32 s80, v24, 22
+; VI-NEXT:    v_readlane_b32 s71, v24, 21
+; VI-NEXT:    v_readlane_b32 s70, v24, 20
+; VI-NEXT:    v_readlane_b32 s69, v24, 19
+; VI-NEXT:    v_readlane_b32 s68, v24, 18
+; VI-NEXT:    v_readlane_b32 s67, v24, 17
+; VI-NEXT:    v_readlane_b32 s66, v24, 16
+; VI-NEXT:    v_readlane_b32 s65, v24, 15
+; VI-NEXT:    v_readlane_b32 s64, v24, 14
+; VI-NEXT:    v_readlane_b32 s55, v24, 13
+; VI-NEXT:    v_readlane_b32 s54, v24, 12
+; VI-NEXT:    v_readlane_b32 s53, v24, 11
+; VI-NEXT:    v_readlane_b32 s52, v24, 10
+; VI-NEXT:    v_readlane_b32 s51, v24, 9
+; VI-NEXT:    v_readlane_b32 s50, v24, 8
+; VI-NEXT:    v_readlane_b32 s49, v24, 7
+; VI-NEXT:    v_readlane_b32 s48, v24, 6
+; VI-NEXT:    v_readlane_b32 s39, v24, 5
+; VI-NEXT:    v_readlane_b32 s38, v24, 4
+; VI-NEXT:    v_readlane_b32 s37, v24, 3
+; VI-NEXT:    v_readlane_b32 s36, v24, 2
+; VI-NEXT:    v_readlane_b32 s35, v24, 1
+; VI-NEXT:    v_readlane_b32 s34, v24, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24646,7 +24646,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) {
+define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25332,7 +25332,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i64_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25340,10 +25340,10 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
+; SI-NEXT:    v_writelane_b32 v24, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s5, v9
 ; SI-NEXT:    v_readfirstlane_b32 s4, v8
 ; SI-NEXT:    v_readfirstlane_b32 s7, v7
@@ -25355,7 +25355,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s13, v1
 ; SI-NEXT:    s_cmp_lg_u32 s12, 0
 ; SI-NEXT:    v_readfirstlane_b32 s12, v0
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
+; SI-NEXT:    v_writelane_b32 v24, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s88, s5, 16
@@ -25505,6 +25505,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s14, s88, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s29
 ; SI-NEXT:    s_or_b32 s5, s5, s14
+; SI-NEXT:    v_readlane_b32 s30, v24, 2
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
@@ -25529,10 +25530,9 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v21, s7
 ; SI-NEXT:    v_mov_b32_e32 v22, s4
 ; SI-NEXT:    v_mov_b32_e32 v23, s5
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 3
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26120,7 +26120,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
+define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v12i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27279,7 +27279,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v12i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27287,29 +27287,29 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s6, v9
 ; SI-NEXT:    v_readfirstlane_b32 s8, v8
 ; SI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -27320,7 +27320,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s76, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s89, v0
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s31, 23
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s90, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -27653,30 +27653,30 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 22
+; SI-NEXT:    v_readlane_b32 s31, v32, 23
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27689,33 +27689,33 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s6, v9
 ; VI-NEXT:    v_readfirstlane_b32 s8, v8
 ; VI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -27726,7 +27726,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s79, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s74, s28, 16
 ; VI-NEXT:    s_lshr_b32 s77, s27, 16
@@ -27964,34 +27964,34 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 26
+; VI-NEXT:    v_readlane_b32 s31, v32, 27
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28287,7 +28287,7 @@ end:
   ret <12 x i64> %phi
 }
 
-define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) {
+define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28901,7 +28901,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28909,10 +28909,10 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s13, v9
 ; SI-NEXT:    v_readfirstlane_b32 s12, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -28924,7 +28924,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s5, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v0
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s35, s13, 16
@@ -29126,6 +29126,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v37
+; SI-NEXT:    v_readlane_b32 s30, v40, 2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v35
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v34
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v33
@@ -29138,10 +29139,9 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v25
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 3
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29913,7 +29913,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
+define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30960,7 +30960,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30968,29 +30968,29 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
 ; SI-NEXT:    v_readfirstlane_b32 s9, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -31001,7 +31001,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
+; SI-NEXT:    v_writelane_b32 v24, s31, 23
 ; SI-NEXT:    s_lshr_b32 s72, s29, 16
 ; SI-NEXT:    s_lshr_b32 s75, s28, 16
 ; SI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -31225,6 +31225,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v24, 22
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -31249,30 +31250,29 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v21, s57
 ; SI-NEXT:    v_mov_b32_e32 v22, s58
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 23
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -31288,33 +31288,33 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v24, s30, 0
-; VI-NEXT:    v_writelane_b32 v24, s31, 1
-; VI-NEXT:    v_writelane_b32 v24, s34, 2
-; VI-NEXT:    v_writelane_b32 v24, s35, 3
-; VI-NEXT:    v_writelane_b32 v24, s36, 4
-; VI-NEXT:    v_writelane_b32 v24, s37, 5
-; VI-NEXT:    v_writelane_b32 v24, s38, 6
-; VI-NEXT:    v_writelane_b32 v24, s39, 7
-; VI-NEXT:    v_writelane_b32 v24, s48, 8
-; VI-NEXT:    v_writelane_b32 v24, s49, 9
-; VI-NEXT:    v_writelane_b32 v24, s50, 10
-; VI-NEXT:    v_writelane_b32 v24, s51, 11
-; VI-NEXT:    v_writelane_b32 v24, s52, 12
-; VI-NEXT:    v_writelane_b32 v24, s53, 13
-; VI-NEXT:    v_writelane_b32 v24, s54, 14
-; VI-NEXT:    v_writelane_b32 v24, s55, 15
-; VI-NEXT:    v_writelane_b32 v24, s64, 16
-; VI-NEXT:    v_writelane_b32 v24, s65, 17
-; VI-NEXT:    v_writelane_b32 v24, s66, 18
-; VI-NEXT:    v_writelane_b32 v24, s67, 19
-; VI-NEXT:    v_writelane_b32 v24, s68, 20
-; VI-NEXT:    v_writelane_b32 v24, s69, 21
-; VI-NEXT:    v_writelane_b32 v24, s70, 22
-; VI-NEXT:    v_writelane_b32 v24, s71, 23
-; VI-NEXT:    v_writelane_b32 v24, s80, 24
-; VI-NEXT:    v_writelane_b32 v24, s81, 25
-; VI-NEXT:    v_writelane_b32 v24, s82, 26
+; VI-NEXT:    v_writelane_b32 v24, s34, 0
+; VI-NEXT:    v_writelane_b32 v24, s35, 1
+; VI-NEXT:    v_writelane_b32 v24, s36, 2
+; VI-NEXT:    v_writelane_b32 v24, s37, 3
+; VI-NEXT:    v_writelane_b32 v24, s38, 4
+; VI-NEXT:    v_writelane_b32 v24, s39, 5
+; VI-NEXT:    v_writelane_b32 v24, s48, 6
+; VI-NEXT:    v_writelane_b32 v24, s49, 7
+; VI-NEXT:    v_writelane_b32 v24, s50, 8
+; VI-NEXT:    v_writelane_b32 v24, s51, 9
+; VI-NEXT:    v_writelane_b32 v24, s52, 10
+; VI-NEXT:    v_writelane_b32 v24, s53, 11
+; VI-NEXT:    v_writelane_b32 v24, s54, 12
+; VI-NEXT:    v_writelane_b32 v24, s55, 13
+; VI-NEXT:    v_writelane_b32 v24, s64, 14
+; VI-NEXT:    v_writelane_b32 v24, s65, 15
+; VI-NEXT:    v_writelane_b32 v24, s66, 16
+; VI-NEXT:    v_writelane_b32 v24, s67, 17
+; VI-NEXT:    v_writelane_b32 v24, s68, 18
+; VI-NEXT:    v_writelane_b32 v24, s69, 19
+; VI-NEXT:    v_writelane_b32 v24, s70, 20
+; VI-NEXT:    v_writelane_b32 v24, s71, 21
+; VI-NEXT:    v_writelane_b32 v24, s80, 22
+; VI-NEXT:    v_writelane_b32 v24, s81, 23
+; VI-NEXT:    v_writelane_b32 v24, s82, 24
+; VI-NEXT:    v_writelane_b32 v24, s83, 25
+; VI-NEXT:    v_writelane_b32 v24, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s7, v9
 ; VI-NEXT:    v_readfirstlane_b32 s9, v8
 ; VI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -31325,7 +31325,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s88, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v24, s83, 27
+; VI-NEXT:    v_writelane_b32 v24, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s78, s27, 16
@@ -31549,6 +31549,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s59, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v24, 26
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -31573,34 +31574,33 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v21, s57
 ; VI-NEXT:    v_mov_b32_e32 v22, s58
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
-; VI-NEXT:    v_readlane_b32 s83, v24, 27
-; VI-NEXT:    v_readlane_b32 s82, v24, 26
-; VI-NEXT:    v_readlane_b32 s81, v24, 25
-; VI-NEXT:    v_readlane_b32 s80, v24, 24
-; VI-NEXT:    v_readlane_b32 s71, v24, 23
-; VI-NEXT:    v_readlane_b32 s70, v24, 22
-; VI-NEXT:    v_readlane_b32 s69, v24, 21
-; VI-NEXT:    v_readlane_b32 s68, v24, 20
-; VI-NEXT:    v_readlane_b32 s67, v24, 19
-; VI-NEXT:    v_readlane_b32 s66, v24, 18
-; VI-NEXT:    v_readlane_b32 s65, v24, 17
-; VI-NEXT:    v_readlane_b32 s64, v24, 16
-; VI-NEXT:    v_readlane_b32 s55, v24, 15
-; VI-NEXT:    v_readlane_b32 s54, v24, 14
-; VI-NEXT:    v_readlane_b32 s53, v24, 13
-; VI-NEXT:    v_readlane_b32 s52, v24, 12
-; VI-NEXT:    v_readlane_b32 s51, v24, 11
-; VI-NEXT:    v_readlane_b32 s50, v24, 10
-; VI-NEXT:    v_readlane_b32 s49, v24, 9
-; VI-NEXT:    v_readlane_b32 s48, v24, 8
-; VI-NEXT:    v_readlane_b32 s39, v24, 7
-; VI-NEXT:    v_readlane_b32 s38, v24, 6
-; VI-NEXT:    v_readlane_b32 s37, v24, 5
-; VI-NEXT:    v_readlane_b32 s36, v24, 4
-; VI-NEXT:    v_readlane_b32 s35, v24, 3
-; VI-NEXT:    v_readlane_b32 s34, v24, 2
-; VI-NEXT:    v_readlane_b32 s31, v24, 1
-; VI-NEXT:    v_readlane_b32 s30, v24, 0
+; VI-NEXT:    v_readlane_b32 s31, v24, 27
+; VI-NEXT:    v_readlane_b32 s83, v24, 25
+; VI-NEXT:    v_readlane_b32 s82, v24, 24
+; VI-NEXT:    v_readlane_b32 s81, v24, 23
+; VI-NEXT:    v_readlane_b32 s80, v24, 22
+; VI-NEXT:    v_readlane_b32 s71, v24, 21
+; VI-NEXT:    v_readlane_b32 s70, v24, 20
+; VI-NEXT:    v_readlane_b32 s69, v24, 19
+; VI-NEXT:    v_readlane_b32 s68, v24, 18
+; VI-NEXT:    v_readlane_b32 s67, v24, 17
+; VI-NEXT:    v_readlane_b32 s66, v24, 16
+; VI-NEXT:    v_readlane_b32 s65, v24, 15
+; VI-NEXT:    v_readlane_b32 s64, v24, 14
+; VI-NEXT:    v_readlane_b32 s55, v24, 13
+; VI-NEXT:    v_readlane_b32 s54, v24, 12
+; VI-NEXT:    v_readlane_b32 s53, v24, 11
+; VI-NEXT:    v_readlane_b32 s52, v24, 10
+; VI-NEXT:    v_readlane_b32 s51, v24, 9
+; VI-NEXT:    v_readlane_b32 s50, v24, 8
+; VI-NEXT:    v_readlane_b32 s49, v24, 7
+; VI-NEXT:    v_readlane_b32 s48, v24, 6
+; VI-NEXT:    v_readlane_b32 s39, v24, 5
+; VI-NEXT:    v_readlane_b32 s38, v24, 4
+; VI-NEXT:    v_readlane_b32 s37, v24, 3
+; VI-NEXT:    v_readlane_b32 s36, v24, 2
+; VI-NEXT:    v_readlane_b32 s35, v24, 1
+; VI-NEXT:    v_readlane_b32 s34, v24, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -31898,7 +31898,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) {
+define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32512,7 +32512,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12f64_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32520,10 +32520,10 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v40, s30, 0
-; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
-; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s30, 2
 ; SI-NEXT:    v_readfirstlane_b32 s13, v9
 ; SI-NEXT:    v_readfirstlane_b32 s12, v8
 ; SI-NEXT:    v_readfirstlane_b32 s11, v7
@@ -32535,7 +32535,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s5, v1
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_readfirstlane_b32 s4, v0
-; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s31, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s35, s13, 16
@@ -32737,6 +32737,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v37
+; SI-NEXT:    v_readlane_b32 s30, v40, 2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v35
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v34
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v33
@@ -32749,10 +32750,9 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v25
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_readlane_b32 s35, v40, 3
-; SI-NEXT:    v_readlane_b32 s34, v40, 2
-; SI-NEXT:    v_readlane_b32 s31, v40, 1
-; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    v_readlane_b32 s31, v40, 3
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -33524,7 +33524,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
+define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v12f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34683,7 +34683,7 @@ end:
   ret <12 x double> %phi
 }
 
-define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v12f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34691,29 +34691,29 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s30, 22
 ; SI-NEXT:    v_readfirstlane_b32 s6, v9
 ; SI-NEXT:    v_readfirstlane_b32 s8, v8
 ; SI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -34724,7 +34724,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s76, v2
 ; SI-NEXT:    v_readfirstlane_b32 s79, v1
 ; SI-NEXT:    v_readfirstlane_b32 s89, v0
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
+; SI-NEXT:    v_writelane_b32 v32, s31, 23
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s90, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -35057,30 +35057,30 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 22
+; SI-NEXT:    v_readlane_b32 s31, v32, 23
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35093,33 +35093,33 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s30, 26
 ; VI-NEXT:    v_readfirstlane_b32 s6, v9
 ; VI-NEXT:    v_readfirstlane_b32 s8, v8
 ; VI-NEXT:    v_readfirstlane_b32 s10, v7
@@ -35130,7 +35130,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s79, v2
 ; VI-NEXT:    v_readfirstlane_b32 s91, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s31, 27
 ; VI-NEXT:    s_lshr_b32 s72, s29, 16
 ; VI-NEXT:    s_lshr_b32 s74, s28, 16
 ; VI-NEXT:    s_lshr_b32 s77, s27, 16
@@ -35368,34 +35368,34 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 26
+; VI-NEXT:    v_readlane_b32 s31, v32, 27
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35691,7 +35691,7 @@ end:
   ret <12 x double> %phi
 }
 
-define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
+define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v48f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36786,7 +36786,7 @@ end:
   ret <48 x half> %phi
 }
 
-define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48i16_to_v48f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36794,42 +36794,40 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v24, s30, 0
-; SI-NEXT:    v_writelane_b32 v24, s31, 1
-; SI-NEXT:    v_writelane_b32 v24, s34, 2
-; SI-NEXT:    v_writelane_b32 v24, s35, 3
-; SI-NEXT:    v_writelane_b32 v24, s36, 4
-; SI-NEXT:    v_writelane_b32 v24, s37, 5
-; SI-NEXT:    v_writelane_b32 v24, s38, 6
-; SI-NEXT:    v_writelane_b32 v24, s39, 7
-; SI-NEXT:    v_writelane_b32 v24, s48, 8
-; SI-NEXT:    v_writelane_b32 v24, s49, 9
-; SI-NEXT:    v_writelane_b32 v24, s50, 10
-; SI-NEXT:    v_writelane_b32 v24, s51, 11
-; SI-NEXT:    v_writelane_b32 v24, s52, 12
-; SI-NEXT:    v_writelane_b32 v24, s53, 13
-; SI-NEXT:    v_writelane_b32 v24, s54, 14
-; SI-NEXT:    v_writelane_b32 v24, s55, 15
-; SI-NEXT:    v_writelane_b32 v24, s64, 16
-; SI-NEXT:    v_writelane_b32 v24, s65, 17
-; SI-NEXT:    v_writelane_b32 v24, s66, 18
-; SI-NEXT:    v_writelane_b32 v24, s67, 19
-; SI-NEXT:    v_writelane_b32 v24, s68, 20
-; SI-NEXT:    v_writelane_b32 v24, s69, 21
-; SI-NEXT:    v_writelane_b32 v24, s70, 22
-; SI-NEXT:    v_writelane_b32 v24, s71, 23
-; SI-NEXT:    v_writelane_b32 v24, s80, 24
-; SI-NEXT:    v_writelane_b32 v24, s81, 25
-; SI-NEXT:    v_writelane_b32 v24, s82, 26
-; SI-NEXT:    v_writelane_b32 v24, s83, 27
-; SI-NEXT:    v_writelane_b32 v24, s84, 28
-; SI-NEXT:    v_writelane_b32 v24, s85, 29
-; SI-NEXT:    v_writelane_b32 v24, s86, 30
-; SI-NEXT:    v_writelane_b32 v24, s87, 31
-; SI-NEXT:    v_writelane_b32 v24, s96, 32
-; SI-NEXT:    v_writelane_b32 v24, s97, 33
-; SI-NEXT:    v_writelane_b32 v24, s98, 34
-; SI-NEXT:    v_writelane_b32 v24, s99, 35
+; SI-NEXT:    v_writelane_b32 v24, s34, 0
+; SI-NEXT:    v_writelane_b32 v24, s35, 1
+; SI-NEXT:    v_writelane_b32 v24, s36, 2
+; SI-NEXT:    v_writelane_b32 v24, s37, 3
+; SI-NEXT:    v_writelane_b32 v24, s38, 4
+; SI-NEXT:    v_writelane_b32 v24, s39, 5
+; SI-NEXT:    v_writelane_b32 v24, s48, 6
+; SI-NEXT:    v_writelane_b32 v24, s49, 7
+; SI-NEXT:    v_writelane_b32 v24, s50, 8
+; SI-NEXT:    v_writelane_b32 v24, s51, 9
+; SI-NEXT:    v_writelane_b32 v24, s52, 10
+; SI-NEXT:    v_writelane_b32 v24, s53, 11
+; SI-NEXT:    v_writelane_b32 v24, s54, 12
+; SI-NEXT:    v_writelane_b32 v24, s55, 13
+; SI-NEXT:    v_writelane_b32 v24, s64, 14
+; SI-NEXT:    v_writelane_b32 v24, s65, 15
+; SI-NEXT:    v_writelane_b32 v24, s66, 16
+; SI-NEXT:    v_writelane_b32 v24, s67, 17
+; SI-NEXT:    v_writelane_b32 v24, s68, 18
+; SI-NEXT:    v_writelane_b32 v24, s69, 19
+; SI-NEXT:    v_writelane_b32 v24, s70, 20
+; SI-NEXT:    v_writelane_b32 v24, s71, 21
+; SI-NEXT:    v_writelane_b32 v24, s80, 22
+; SI-NEXT:    v_writelane_b32 v24, s81, 23
+; SI-NEXT:    v_writelane_b32 v24, s82, 24
+; SI-NEXT:    v_writelane_b32 v24, s83, 25
+; SI-NEXT:    v_writelane_b32 v24, s84, 26
+; SI-NEXT:    v_writelane_b32 v24, s85, 27
+; SI-NEXT:    v_writelane_b32 v24, s86, 28
+; SI-NEXT:    v_writelane_b32 v24, s87, 29
+; SI-NEXT:    v_writelane_b32 v24, s96, 30
+; SI-NEXT:    v_writelane_b32 v24, s97, 31
+; SI-NEXT:    v_writelane_b32 v24, s98, 32
+; SI-NEXT:    v_writelane_b32 v24, s99, 33
 ; SI-NEXT:    v_readfirstlane_b32 s99, v9
 ; SI-NEXT:    v_readfirstlane_b32 s65, v8
 ; SI-NEXT:    v_readfirstlane_b32 s96, v7
@@ -36865,7 +36863,9 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; SI-NEXT:    s_lshr_b32 s51, s71, 16
 ; SI-NEXT:    s_lshr_b32 s80, s81, 16
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
+; SI-NEXT:    v_writelane_b32 v24, s30, 34
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
+; SI-NEXT:    v_writelane_b32 v24, s31, 35
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s5, s17, 0xffff
@@ -37187,6 +37187,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s27, s47, 0xffff
 ; SI-NEXT:    s_lshl_b32 s28, s55, 16
 ; SI-NEXT:    s_or_b32 s27, s27, s28
+; SI-NEXT:    v_readlane_b32 s30, v24, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -37211,42 +37212,41 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v21, s25
 ; SI-NEXT:    v_mov_b32_e32 v22, s26
 ; SI-NEXT:    v_mov_b32_e32 v23, s27
-; SI-NEXT:    v_readlane_b32 s99, v24, 35
-; SI-NEXT:    v_readlane_b32 s98, v24, 34
-; SI-NEXT:    v_readlane_b32 s97, v24, 33
-; SI-NEXT:    v_readlane_b32 s96, v24, 32
-; SI-NEXT:    v_readlane_b32 s87, v24, 31
-; SI-NEXT:    v_readlane_b32 s86, v24, 30
-; SI-NEXT:    v_readlane_b32 s85, v24, 29
-; SI-NEXT:    v_readlane_b32 s84, v24, 28
-; SI-NEXT:    v_readlane_b32 s83, v24, 27
-; SI-NEXT:    v_readlane_b32 s82, v24, 26
-; SI-NEXT:    v_readlane_b32 s81, v24, 25
-; SI-NEXT:    v_readlane_b32 s80, v24, 24
-; SI-NEXT:    v_readlane_b32 s71, v24, 23
-; SI-NEXT:    v_readlane_b32 s70, v24, 22
-; SI-NEXT:    v_readlane_b32 s69, v24, 21
-; SI-NEXT:    v_readlane_b32 s68, v24, 20
-; SI-NEXT:    v_readlane_b32 s67, v24, 19
-; SI-NEXT:    v_readlane_b32 s66, v24, 18
-; SI-NEXT:    v_readlane_b32 s65, v24, 17
-; SI-NEXT:    v_readlane_b32 s64, v24, 16
-; SI-NEXT:    v_readlane_b32 s55, v24, 15
-; SI-NEXT:    v_readlane_b32 s54, v24, 14
-; SI-NEXT:    v_readlane_b32 s53, v24, 13
-; SI-NEXT:    v_readlane_b32 s52, v24, 12
-; SI-NEXT:    v_readlane_b32 s51, v24, 11
-; SI-NEXT:    v_readlane_b32 s50, v24, 10
-; SI-NEXT:    v_readlane_b32 s49, v24, 9
-; SI-NEXT:    v_readlane_b32 s48, v24, 8
-; SI-NEXT:    v_readlane_b32 s39, v24, 7
-; SI-NEXT:    v_readlane_b32 s38, v24, 6
-; SI-NEXT:    v_readlane_b32 s37, v24, 5
-; SI-NEXT:    v_readlane_b32 s36, v24, 4
-; SI-NEXT:    v_readlane_b32 s35, v24, 3
-; SI-NEXT:    v_readlane_b32 s34, v24, 2
-; SI-NEXT:    v_readlane_b32 s31, v24, 1
-; SI-NEXT:    v_readlane_b32 s30, v24, 0
+; SI-NEXT:    v_readlane_b32 s31, v24, 35
+; SI-NEXT:    v_readlane_b32 s99, v24, 33
+; SI-NEXT:    v_readlane_b32 s98, v24, 32
+; SI-NEXT:    v_readlane_b32 s97, v24, 31
+; SI-NEXT:    v_readlane_b32 s96, v24, 30
+; SI-NEXT:    v_readlane_b32 s87, v24, 29
+; SI-NEXT:    v_readlane_b32 s86, v24, 28
+; SI-NEXT:    v_readlane_b32 s85, v24, 27
+; SI-NEXT:    v_readlane_b32 s84, v24, 26
+; SI-NEXT:    v_readlane_b32 s83, v24, 25
+; SI-NEXT:    v_readlane_b32 s82, v24, 24
+; SI-NEXT:    v_readlane_b32 s81, v24, 23
+; SI-NEXT:    v_readlane_b32 s80, v24, 22
+; SI-NEXT:    v_readlane_b32 s71, v24, 21
+; SI-NEXT:    v_readlane_b32 s70, v24, 20
+; SI-NEXT:    v_readlane_b32 s69, v24, 19
+; SI-NEXT:    v_readlane_b32 s68, v24, 18
+; SI-NEXT:    v_readlane_b32 s67, v24, 17
+; SI-NEXT:    v_readlane_b32 s66, v24, 16
+; SI-NEXT:    v_readlane_b32 s65, v24, 15
+; SI-NEXT:    v_readlane_b32 s64, v24, 14
+; SI-NEXT:    v_readlane_b32 s55, v24, 13
+; SI-NEXT:    v_readlane_b32 s54, v24, 12
+; SI-NEXT:    v_readlane_b32 s53, v24, 11
+; SI-NEXT:    v_readlane_b32 s52, v24, 10
+; SI-NEXT:    v_readlane_b32 s51, v24, 9
+; SI-NEXT:    v_readlane_b32 s50, v24, 8
+; SI-NEXT:    v_readlane_b32 s49, v24, 7
+; SI-NEXT:    v_readlane_b32 s48, v24, 6
+; SI-NEXT:    v_readlane_b32 s39, v24, 5
+; SI-NEXT:    v_readlane_b32 s38, v24, 4
+; SI-NEXT:    v_readlane_b32 s37, v24, 3
+; SI-NEXT:    v_readlane_b32 s36, v24, 2
+; SI-NEXT:    v_readlane_b32 s35, v24, 1
+; SI-NEXT:    v_readlane_b32 s34, v24, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -38067,7 +38067,7 @@ end:
   ret <48 x half> %phi
 }
 
-define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) {
+define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v48i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38843,7 +38843,7 @@ end:
   ret <48 x i16> %phi
 }
 
-define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40069,3 +40069,5 @@ end:
   %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <48 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index b52e8a54540dc..272038cfc4881 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) {
+define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -180,7 +180,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -495,7 +495,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) {
+define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -656,7 +656,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1184,7 +1184,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) {
+define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1358,7 +1358,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1673,7 +1673,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) {
+define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1854,7 +1854,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2169,7 +2169,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) {
+define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2343,7 +2343,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2658,7 +2658,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) {
+define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2780,7 +2780,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3256,7 +3256,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) {
+define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4014,7 +4014,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4022,15 +4022,15 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
+; SI-NEXT:    v_writelane_b32 v26, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -4044,7 +4044,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s15, v1
 ; SI-NEXT:    s_cmp_lg_u32 s14, 0
 ; SI-NEXT:    v_readfirstlane_b32 s14, v0
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
+; SI-NEXT:    v_writelane_b32 v26, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s92, s5, 16
@@ -4206,6 +4206,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s40, s92, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s41
 ; SI-NEXT:    s_or_b32 s5, s5, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 7
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -4232,15 +4233,14 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v23, s7
 ; SI-NEXT:    v_mov_b32_e32 v24, s4
 ; SI-NEXT:    v_mov_b32_e32 v25, s5
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 8
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4875,7 +4875,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
+define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6024,7 +6024,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6032,33 +6032,34 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s30, 26
+; SI-NEXT:    v_writelane_b32 v26, s31, 27
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
 ; SI-NEXT:    v_readfirstlane_b32 s9, v10
 ; SI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -6071,7 +6072,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s93, v2
 ; SI-NEXT:    v_readfirstlane_b32 s30, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
 ; SI-NEXT:    s_lshr_b32 s76, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -6313,6 +6313,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v26, 26
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -6339,34 +6340,33 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s60
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 27
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -6382,37 +6382,38 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v26, s30, 0
-; VI-NEXT:    v_writelane_b32 v26, s31, 1
-; VI-NEXT:    v_writelane_b32 v26, s34, 2
-; VI-NEXT:    v_writelane_b32 v26, s35, 3
-; VI-NEXT:    v_writelane_b32 v26, s36, 4
-; VI-NEXT:    v_writelane_b32 v26, s37, 5
-; VI-NEXT:    v_writelane_b32 v26, s38, 6
-; VI-NEXT:    v_writelane_b32 v26, s39, 7
-; VI-NEXT:    v_writelane_b32 v26, s48, 8
-; VI-NEXT:    v_writelane_b32 v26, s49, 9
-; VI-NEXT:    v_writelane_b32 v26, s50, 10
-; VI-NEXT:    v_writelane_b32 v26, s51, 11
-; VI-NEXT:    v_writelane_b32 v26, s52, 12
-; VI-NEXT:    v_writelane_b32 v26, s53, 13
-; VI-NEXT:    v_writelane_b32 v26, s54, 14
-; VI-NEXT:    v_writelane_b32 v26, s55, 15
-; VI-NEXT:    v_writelane_b32 v26, s64, 16
-; VI-NEXT:    v_writelane_b32 v26, s65, 17
-; VI-NEXT:    v_writelane_b32 v26, s66, 18
-; VI-NEXT:    v_writelane_b32 v26, s67, 19
-; VI-NEXT:    v_writelane_b32 v26, s68, 20
-; VI-NEXT:    v_writelane_b32 v26, s69, 21
-; VI-NEXT:    v_writelane_b32 v26, s70, 22
-; VI-NEXT:    v_writelane_b32 v26, s71, 23
-; VI-NEXT:    v_writelane_b32 v26, s80, 24
-; VI-NEXT:    v_writelane_b32 v26, s81, 25
-; VI-NEXT:    v_writelane_b32 v26, s82, 26
-; VI-NEXT:    v_writelane_b32 v26, s83, 27
-; VI-NEXT:    v_writelane_b32 v26, s84, 28
-; VI-NEXT:    v_writelane_b32 v26, s85, 29
-; VI-NEXT:    v_writelane_b32 v26, s86, 30
+; VI-NEXT:    v_writelane_b32 v26, s34, 0
+; VI-NEXT:    v_writelane_b32 v26, s35, 1
+; VI-NEXT:    v_writelane_b32 v26, s36, 2
+; VI-NEXT:    v_writelane_b32 v26, s37, 3
+; VI-NEXT:    v_writelane_b32 v26, s38, 4
+; VI-NEXT:    v_writelane_b32 v26, s39, 5
+; VI-NEXT:    v_writelane_b32 v26, s48, 6
+; VI-NEXT:    v_writelane_b32 v26, s49, 7
+; VI-NEXT:    v_writelane_b32 v26, s50, 8
+; VI-NEXT:    v_writelane_b32 v26, s51, 9
+; VI-NEXT:    v_writelane_b32 v26, s52, 10
+; VI-NEXT:    v_writelane_b32 v26, s53, 11
+; VI-NEXT:    v_writelane_b32 v26, s54, 12
+; VI-NEXT:    v_writelane_b32 v26, s55, 13
+; VI-NEXT:    v_writelane_b32 v26, s64, 14
+; VI-NEXT:    v_writelane_b32 v26, s65, 15
+; VI-NEXT:    v_writelane_b32 v26, s66, 16
+; VI-NEXT:    v_writelane_b32 v26, s67, 17
+; VI-NEXT:    v_writelane_b32 v26, s68, 18
+; VI-NEXT:    v_writelane_b32 v26, s69, 19
+; VI-NEXT:    v_writelane_b32 v26, s70, 20
+; VI-NEXT:    v_writelane_b32 v26, s71, 21
+; VI-NEXT:    v_writelane_b32 v26, s80, 22
+; VI-NEXT:    v_writelane_b32 v26, s81, 23
+; VI-NEXT:    v_writelane_b32 v26, s82, 24
+; VI-NEXT:    v_writelane_b32 v26, s83, 25
+; VI-NEXT:    v_writelane_b32 v26, s84, 26
+; VI-NEXT:    v_writelane_b32 v26, s85, 27
+; VI-NEXT:    v_writelane_b32 v26, s86, 28
+; VI-NEXT:    v_writelane_b32 v26, s87, 29
+; VI-NEXT:    v_writelane_b32 v26, s30, 30
+; VI-NEXT:    v_writelane_b32 v26, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v10
 ; VI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -6425,7 +6426,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s31, v2
 ; VI-NEXT:    v_readfirstlane_b32 s68, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v26, s87, 31
 ; VI-NEXT:    s_lshr_b32 s76, s29, 16
 ; VI-NEXT:    s_lshr_b32 s79, s28, 16
 ; VI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -6667,6 +6667,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v26, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -6693,38 +6694,37 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
 ; VI-NEXT:    v_mov_b32_e32 v24, s60
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
-; VI-NEXT:    v_readlane_b32 s87, v26, 31
-; VI-NEXT:    v_readlane_b32 s86, v26, 30
-; VI-NEXT:    v_readlane_b32 s85, v26, 29
-; VI-NEXT:    v_readlane_b32 s84, v26, 28
-; VI-NEXT:    v_readlane_b32 s83, v26, 27
-; VI-NEXT:    v_readlane_b32 s82, v26, 26
-; VI-NEXT:    v_readlane_b32 s81, v26, 25
-; VI-NEXT:    v_readlane_b32 s80, v26, 24
-; VI-NEXT:    v_readlane_b32 s71, v26, 23
-; VI-NEXT:    v_readlane_b32 s70, v26, 22
-; VI-NEXT:    v_readlane_b32 s69, v26, 21
-; VI-NEXT:    v_readlane_b32 s68, v26, 20
-; VI-NEXT:    v_readlane_b32 s67, v26, 19
-; VI-NEXT:    v_readlane_b32 s66, v26, 18
-; VI-NEXT:    v_readlane_b32 s65, v26, 17
-; VI-NEXT:    v_readlane_b32 s64, v26, 16
-; VI-NEXT:    v_readlane_b32 s55, v26, 15
-; VI-NEXT:    v_readlane_b32 s54, v26, 14
-; VI-NEXT:    v_readlane_b32 s53, v26, 13
-; VI-NEXT:    v_readlane_b32 s52, v26, 12
-; VI-NEXT:    v_readlane_b32 s51, v26, 11
-; VI-NEXT:    v_readlane_b32 s50, v26, 10
-; VI-NEXT:    v_readlane_b32 s49, v26, 9
-; VI-NEXT:    v_readlane_b32 s48, v26, 8
-; VI-NEXT:    v_readlane_b32 s39, v26, 7
-; VI-NEXT:    v_readlane_b32 s38, v26, 6
-; VI-NEXT:    v_readlane_b32 s37, v26, 5
-; VI-NEXT:    v_readlane_b32 s36, v26, 4
-; VI-NEXT:    v_readlane_b32 s35, v26, 3
-; VI-NEXT:    v_readlane_b32 s34, v26, 2
-; VI-NEXT:    v_readlane_b32 s31, v26, 1
-; VI-NEXT:    v_readlane_b32 s30, v26, 0
+; VI-NEXT:    v_readlane_b32 s31, v26, 31
+; VI-NEXT:    v_readlane_b32 s87, v26, 29
+; VI-NEXT:    v_readlane_b32 s86, v26, 28
+; VI-NEXT:    v_readlane_b32 s85, v26, 27
+; VI-NEXT:    v_readlane_b32 s84, v26, 26
+; VI-NEXT:    v_readlane_b32 s83, v26, 25
+; VI-NEXT:    v_readlane_b32 s82, v26, 24
+; VI-NEXT:    v_readlane_b32 s81, v26, 23
+; VI-NEXT:    v_readlane_b32 s80, v26, 22
+; VI-NEXT:    v_readlane_b32 s71, v26, 21
+; VI-NEXT:    v_readlane_b32 s70, v26, 20
+; VI-NEXT:    v_readlane_b32 s69, v26, 19
+; VI-NEXT:    v_readlane_b32 s68, v26, 18
+; VI-NEXT:    v_readlane_b32 s67, v26, 17
+; VI-NEXT:    v_readlane_b32 s66, v26, 16
+; VI-NEXT:    v_readlane_b32 s65, v26, 15
+; VI-NEXT:    v_readlane_b32 s64, v26, 14
+; VI-NEXT:    v_readlane_b32 s55, v26, 13
+; VI-NEXT:    v_readlane_b32 s54, v26, 12
+; VI-NEXT:    v_readlane_b32 s53, v26, 11
+; VI-NEXT:    v_readlane_b32 s52, v26, 10
+; VI-NEXT:    v_readlane_b32 s51, v26, 9
+; VI-NEXT:    v_readlane_b32 s50, v26, 8
+; VI-NEXT:    v_readlane_b32 s49, v26, 7
+; VI-NEXT:    v_readlane_b32 s48, v26, 6
+; VI-NEXT:    v_readlane_b32 s39, v26, 5
+; VI-NEXT:    v_readlane_b32 s38, v26, 4
+; VI-NEXT:    v_readlane_b32 s37, v26, 3
+; VI-NEXT:    v_readlane_b32 s36, v26, 2
+; VI-NEXT:    v_readlane_b32 s35, v26, 1
+; VI-NEXT:    v_readlane_b32 s34, v26, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -7038,7 +7038,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) {
+define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7796,7 +7796,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26i32_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7804,15 +7804,15 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
+; SI-NEXT:    v_writelane_b32 v26, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -7826,7 +7826,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s15, v1
 ; SI-NEXT:    s_cmp_lg_u32 s14, 0
 ; SI-NEXT:    v_readfirstlane_b32 s14, v0
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
+; SI-NEXT:    v_writelane_b32 v26, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s92, s5, 16
@@ -7988,6 +7988,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s40, s92, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s41
 ; SI-NEXT:    s_or_b32 s5, s5, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 7
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -8014,15 +8015,14 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v23, s7
 ; SI-NEXT:    v_mov_b32_e32 v24, s4
 ; SI-NEXT:    v_mov_b32_e32 v25, s5
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 8
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8657,7 +8657,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
+define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v26i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9927,7 +9927,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v26i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9935,33 +9935,33 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s30, 26
 ; SI-NEXT:    v_readfirstlane_b32 s6, v11
 ; SI-NEXT:    v_readfirstlane_b32 s8, v10
 ; SI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -9974,7 +9974,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
+; SI-NEXT:    v_writelane_b32 v32, s31, 27
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
@@ -10331,34 +10331,34 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 26
+; SI-NEXT:    v_readlane_b32 s31, v32, 27
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10371,37 +10371,38 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s6, v11
 ; VI-NEXT:    v_readfirstlane_b32 s8, v10
 ; VI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -10414,7 +10415,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s30, v2
 ; VI-NEXT:    v_readfirstlane_b32 s35, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s74, s29, 16
 ; VI-NEXT:    s_lshr_b32 s77, s28, 16
 ; VI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -10668,38 +10668,38 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -11011,7 +11011,7 @@ end:
   ret <26 x i32> %phi
 }
 
-define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) {
+define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11172,7 +11172,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11700,7 +11700,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) {
+define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11881,7 +11881,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12196,7 +12196,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) {
+define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12357,7 +12357,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12885,7 +12885,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) {
+define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13007,7 +13007,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13483,7 +13483,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) {
+define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14215,7 +14215,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14223,15 +14223,15 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v45, s30, 0
-; SI-NEXT:    v_writelane_b32 v45, s31, 1
-; SI-NEXT:    v_writelane_b32 v45, s34, 2
-; SI-NEXT:    v_writelane_b32 v45, s35, 3
-; SI-NEXT:    v_writelane_b32 v45, s36, 4
-; SI-NEXT:    v_writelane_b32 v45, s37, 5
-; SI-NEXT:    v_writelane_b32 v45, s38, 6
+; SI-NEXT:    v_writelane_b32 v45, s34, 0
+; SI-NEXT:    v_writelane_b32 v45, s35, 1
+; SI-NEXT:    v_writelane_b32 v45, s36, 2
+; SI-NEXT:    v_writelane_b32 v45, s37, 3
+; SI-NEXT:    v_writelane_b32 v45, s38, 4
+; SI-NEXT:    v_writelane_b32 v45, s39, 5
+; SI-NEXT:    v_writelane_b32 v45, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v45, s39, 7
+; SI-NEXT:    v_writelane_b32 v45, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -14250,7 +14250,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v45, s48, 8
+; SI-NEXT:    v_writelane_b32 v45, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s5, 16
@@ -14492,6 +14492,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v48
+; SI-NEXT:    v_readlane_b32 s30, v45, 7
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v38
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v37
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v36
@@ -14505,15 +14506,14 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v28
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_readlane_b32 s48, v45, 8
-; SI-NEXT:    v_readlane_b32 s39, v45, 7
-; SI-NEXT:    v_readlane_b32 s38, v45, 6
-; SI-NEXT:    v_readlane_b32 s37, v45, 5
-; SI-NEXT:    v_readlane_b32 s36, v45, 4
-; SI-NEXT:    v_readlane_b32 s35, v45, 3
-; SI-NEXT:    v_readlane_b32 s34, v45, 2
-; SI-NEXT:    v_readlane_b32 s31, v45, 1
-; SI-NEXT:    v_readlane_b32 s30, v45, 0
+; SI-NEXT:    v_readlane_b32 s31, v45, 8
+; SI-NEXT:    v_readlane_b32 s48, v45, 6
+; SI-NEXT:    v_readlane_b32 s39, v45, 5
+; SI-NEXT:    v_readlane_b32 s38, v45, 4
+; SI-NEXT:    v_readlane_b32 s37, v45, 3
+; SI-NEXT:    v_readlane_b32 s36, v45, 2
+; SI-NEXT:    v_readlane_b32 s35, v45, 1
+; SI-NEXT:    v_readlane_b32 s34, v45, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15417,7 +15417,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
+define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16566,7 +16566,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16574,33 +16574,34 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s30, 26
+; SI-NEXT:    v_writelane_b32 v26, s31, 27
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
 ; SI-NEXT:    v_readfirstlane_b32 s9, v10
 ; SI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -16613,7 +16614,6 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s93, v2
 ; SI-NEXT:    v_readfirstlane_b32 s30, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
 ; SI-NEXT:    s_lshr_b32 s76, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -16855,6 +16855,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v26, 26
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -16881,34 +16882,33 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s60
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 27
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16924,37 +16924,38 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v26, s30, 0
-; VI-NEXT:    v_writelane_b32 v26, s31, 1
-; VI-NEXT:    v_writelane_b32 v26, s34, 2
-; VI-NEXT:    v_writelane_b32 v26, s35, 3
-; VI-NEXT:    v_writelane_b32 v26, s36, 4
-; VI-NEXT:    v_writelane_b32 v26, s37, 5
-; VI-NEXT:    v_writelane_b32 v26, s38, 6
-; VI-NEXT:    v_writelane_b32 v26, s39, 7
-; VI-NEXT:    v_writelane_b32 v26, s48, 8
-; VI-NEXT:    v_writelane_b32 v26, s49, 9
-; VI-NEXT:    v_writelane_b32 v26, s50, 10
-; VI-NEXT:    v_writelane_b32 v26, s51, 11
-; VI-NEXT:    v_writelane_b32 v26, s52, 12
-; VI-NEXT:    v_writelane_b32 v26, s53, 13
-; VI-NEXT:    v_writelane_b32 v26, s54, 14
-; VI-NEXT:    v_writelane_b32 v26, s55, 15
-; VI-NEXT:    v_writelane_b32 v26, s64, 16
-; VI-NEXT:    v_writelane_b32 v26, s65, 17
-; VI-NEXT:    v_writelane_b32 v26, s66, 18
-; VI-NEXT:    v_writelane_b32 v26, s67, 19
-; VI-NEXT:    v_writelane_b32 v26, s68, 20
-; VI-NEXT:    v_writelane_b32 v26, s69, 21
-; VI-NEXT:    v_writelane_b32 v26, s70, 22
-; VI-NEXT:    v_writelane_b32 v26, s71, 23
-; VI-NEXT:    v_writelane_b32 v26, s80, 24
-; VI-NEXT:    v_writelane_b32 v26, s81, 25
-; VI-NEXT:    v_writelane_b32 v26, s82, 26
-; VI-NEXT:    v_writelane_b32 v26, s83, 27
-; VI-NEXT:    v_writelane_b32 v26, s84, 28
-; VI-NEXT:    v_writelane_b32 v26, s85, 29
-; VI-NEXT:    v_writelane_b32 v26, s86, 30
+; VI-NEXT:    v_writelane_b32 v26, s34, 0
+; VI-NEXT:    v_writelane_b32 v26, s35, 1
+; VI-NEXT:    v_writelane_b32 v26, s36, 2
+; VI-NEXT:    v_writelane_b32 v26, s37, 3
+; VI-NEXT:    v_writelane_b32 v26, s38, 4
+; VI-NEXT:    v_writelane_b32 v26, s39, 5
+; VI-NEXT:    v_writelane_b32 v26, s48, 6
+; VI-NEXT:    v_writelane_b32 v26, s49, 7
+; VI-NEXT:    v_writelane_b32 v26, s50, 8
+; VI-NEXT:    v_writelane_b32 v26, s51, 9
+; VI-NEXT:    v_writelane_b32 v26, s52, 10
+; VI-NEXT:    v_writelane_b32 v26, s53, 11
+; VI-NEXT:    v_writelane_b32 v26, s54, 12
+; VI-NEXT:    v_writelane_b32 v26, s55, 13
+; VI-NEXT:    v_writelane_b32 v26, s64, 14
+; VI-NEXT:    v_writelane_b32 v26, s65, 15
+; VI-NEXT:    v_writelane_b32 v26, s66, 16
+; VI-NEXT:    v_writelane_b32 v26, s67, 17
+; VI-NEXT:    v_writelane_b32 v26, s68, 18
+; VI-NEXT:    v_writelane_b32 v26, s69, 19
+; VI-NEXT:    v_writelane_b32 v26, s70, 20
+; VI-NEXT:    v_writelane_b32 v26, s71, 21
+; VI-NEXT:    v_writelane_b32 v26, s80, 22
+; VI-NEXT:    v_writelane_b32 v26, s81, 23
+; VI-NEXT:    v_writelane_b32 v26, s82, 24
+; VI-NEXT:    v_writelane_b32 v26, s83, 25
+; VI-NEXT:    v_writelane_b32 v26, s84, 26
+; VI-NEXT:    v_writelane_b32 v26, s85, 27
+; VI-NEXT:    v_writelane_b32 v26, s86, 28
+; VI-NEXT:    v_writelane_b32 v26, s87, 29
+; VI-NEXT:    v_writelane_b32 v26, s30, 30
+; VI-NEXT:    v_writelane_b32 v26, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v10
 ; VI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -16967,7 +16968,6 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s31, v2
 ; VI-NEXT:    v_readfirstlane_b32 s68, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v26, s87, 31
 ; VI-NEXT:    s_lshr_b32 s76, s29, 16
 ; VI-NEXT:    s_lshr_b32 s79, s28, 16
 ; VI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -17209,6 +17209,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v26, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -17235,38 +17236,37 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
 ; VI-NEXT:    v_mov_b32_e32 v24, s60
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
-; VI-NEXT:    v_readlane_b32 s87, v26, 31
-; VI-NEXT:    v_readlane_b32 s86, v26, 30
-; VI-NEXT:    v_readlane_b32 s85, v26, 29
-; VI-NEXT:    v_readlane_b32 s84, v26, 28
-; VI-NEXT:    v_readlane_b32 s83, v26, 27
-; VI-NEXT:    v_readlane_b32 s82, v26, 26
-; VI-NEXT:    v_readlane_b32 s81, v26, 25
-; VI-NEXT:    v_readlane_b32 s80, v26, 24
-; VI-NEXT:    v_readlane_b32 s71, v26, 23
-; VI-NEXT:    v_readlane_b32 s70, v26, 22
-; VI-NEXT:    v_readlane_b32 s69, v26, 21
-; VI-NEXT:    v_readlane_b32 s68, v26, 20
-; VI-NEXT:    v_readlane_b32 s67, v26, 19
-; VI-NEXT:    v_readlane_b32 s66, v26, 18
-; VI-NEXT:    v_readlane_b32 s65, v26, 17
-; VI-NEXT:    v_readlane_b32 s64, v26, 16
-; VI-NEXT:    v_readlane_b32 s55, v26, 15
-; VI-NEXT:    v_readlane_b32 s54, v26, 14
-; VI-NEXT:    v_readlane_b32 s53, v26, 13
-; VI-NEXT:    v_readlane_b32 s52, v26, 12
-; VI-NEXT:    v_readlane_b32 s51, v26, 11
-; VI-NEXT:    v_readlane_b32 s50, v26, 10
-; VI-NEXT:    v_readlane_b32 s49, v26, 9
-; VI-NEXT:    v_readlane_b32 s48, v26, 8
-; VI-NEXT:    v_readlane_b32 s39, v26, 7
-; VI-NEXT:    v_readlane_b32 s38, v26, 6
-; VI-NEXT:    v_readlane_b32 s37, v26, 5
-; VI-NEXT:    v_readlane_b32 s36, v26, 4
-; VI-NEXT:    v_readlane_b32 s35, v26, 3
-; VI-NEXT:    v_readlane_b32 s34, v26, 2
-; VI-NEXT:    v_readlane_b32 s31, v26, 1
-; VI-NEXT:    v_readlane_b32 s30, v26, 0
+; VI-NEXT:    v_readlane_b32 s31, v26, 31
+; VI-NEXT:    v_readlane_b32 s87, v26, 29
+; VI-NEXT:    v_readlane_b32 s86, v26, 28
+; VI-NEXT:    v_readlane_b32 s85, v26, 27
+; VI-NEXT:    v_readlane_b32 s84, v26, 26
+; VI-NEXT:    v_readlane_b32 s83, v26, 25
+; VI-NEXT:    v_readlane_b32 s82, v26, 24
+; VI-NEXT:    v_readlane_b32 s81, v26, 23
+; VI-NEXT:    v_readlane_b32 s80, v26, 22
+; VI-NEXT:    v_readlane_b32 s71, v26, 21
+; VI-NEXT:    v_readlane_b32 s70, v26, 20
+; VI-NEXT:    v_readlane_b32 s69, v26, 19
+; VI-NEXT:    v_readlane_b32 s68, v26, 18
+; VI-NEXT:    v_readlane_b32 s67, v26, 17
+; VI-NEXT:    v_readlane_b32 s66, v26, 16
+; VI-NEXT:    v_readlane_b32 s65, v26, 15
+; VI-NEXT:    v_readlane_b32 s64, v26, 14
+; VI-NEXT:    v_readlane_b32 s55, v26, 13
+; VI-NEXT:    v_readlane_b32 s54, v26, 12
+; VI-NEXT:    v_readlane_b32 s53, v26, 11
+; VI-NEXT:    v_readlane_b32 s52, v26, 10
+; VI-NEXT:    v_readlane_b32 s51, v26, 9
+; VI-NEXT:    v_readlane_b32 s50, v26, 8
+; VI-NEXT:    v_readlane_b32 s49, v26, 7
+; VI-NEXT:    v_readlane_b32 s48, v26, 6
+; VI-NEXT:    v_readlane_b32 s39, v26, 5
+; VI-NEXT:    v_readlane_b32 s38, v26, 4
+; VI-NEXT:    v_readlane_b32 s37, v26, 3
+; VI-NEXT:    v_readlane_b32 s36, v26, 2
+; VI-NEXT:    v_readlane_b32 s35, v26, 1
+; VI-NEXT:    v_readlane_b32 s34, v26, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17580,7 +17580,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) {
+define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18312,7 +18312,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v26f32_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18320,15 +18320,15 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v45, s30, 0
-; SI-NEXT:    v_writelane_b32 v45, s31, 1
-; SI-NEXT:    v_writelane_b32 v45, s34, 2
-; SI-NEXT:    v_writelane_b32 v45, s35, 3
-; SI-NEXT:    v_writelane_b32 v45, s36, 4
-; SI-NEXT:    v_writelane_b32 v45, s37, 5
-; SI-NEXT:    v_writelane_b32 v45, s38, 6
+; SI-NEXT:    v_writelane_b32 v45, s34, 0
+; SI-NEXT:    v_writelane_b32 v45, s35, 1
+; SI-NEXT:    v_writelane_b32 v45, s36, 2
+; SI-NEXT:    v_writelane_b32 v45, s37, 3
+; SI-NEXT:    v_writelane_b32 v45, s38, 4
+; SI-NEXT:    v_writelane_b32 v45, s39, 5
+; SI-NEXT:    v_writelane_b32 v45, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v45, s39, 7
+; SI-NEXT:    v_writelane_b32 v45, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -18347,7 +18347,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v45, s48, 8
+; SI-NEXT:    v_writelane_b32 v45, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s5, 16
@@ -18589,6 +18589,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v48
+; SI-NEXT:    v_readlane_b32 s30, v45, 7
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v38
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v37
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v36
@@ -18602,15 +18603,14 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v28
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_readlane_b32 s48, v45, 8
-; SI-NEXT:    v_readlane_b32 s39, v45, 7
-; SI-NEXT:    v_readlane_b32 s38, v45, 6
-; SI-NEXT:    v_readlane_b32 s37, v45, 5
-; SI-NEXT:    v_readlane_b32 s36, v45, 4
-; SI-NEXT:    v_readlane_b32 s35, v45, 3
-; SI-NEXT:    v_readlane_b32 s34, v45, 2
-; SI-NEXT:    v_readlane_b32 s31, v45, 1
-; SI-NEXT:    v_readlane_b32 s30, v45, 0
+; SI-NEXT:    v_readlane_b32 s31, v45, 8
+; SI-NEXT:    v_readlane_b32 s48, v45, 6
+; SI-NEXT:    v_readlane_b32 s39, v45, 5
+; SI-NEXT:    v_readlane_b32 s38, v45, 4
+; SI-NEXT:    v_readlane_b32 s37, v45, 3
+; SI-NEXT:    v_readlane_b32 s36, v45, 2
+; SI-NEXT:    v_readlane_b32 s35, v45, 1
+; SI-NEXT:    v_readlane_b32 s34, v45, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19514,7 +19514,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
+define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v26f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20784,7 +20784,7 @@ end:
   ret <26 x float> %phi
 }
 
-define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v26f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20792,33 +20792,33 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s30, 26
 ; SI-NEXT:    v_readfirstlane_b32 s6, v11
 ; SI-NEXT:    v_readfirstlane_b32 s8, v10
 ; SI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -20831,7 +20831,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
+; SI-NEXT:    v_writelane_b32 v32, s31, 27
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
@@ -21188,34 +21188,34 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 26
+; SI-NEXT:    v_readlane_b32 s31, v32, 27
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21228,37 +21228,38 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s6, v11
 ; VI-NEXT:    v_readfirstlane_b32 s8, v10
 ; VI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -21271,7 +21272,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s30, v2
 ; VI-NEXT:    v_readfirstlane_b32 s35, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s74, s29, 16
 ; VI-NEXT:    s_lshr_b32 s77, s28, 16
 ; VI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -21525,38 +21525,38 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21868,7 +21868,7 @@ end:
   ret <26 x float> %phi
 }
 
-define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) {
+define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22049,7 +22049,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22363,7 +22363,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) {
+define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22485,7 +22485,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22961,7 +22961,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) {
+define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23733,7 +23733,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23741,15 +23741,15 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
+; SI-NEXT:    v_writelane_b32 v26, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -23763,7 +23763,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s15, v1
 ; SI-NEXT:    s_cmp_lg_u32 s14, 0
 ; SI-NEXT:    v_readfirstlane_b32 s14, v0
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
+; SI-NEXT:    v_writelane_b32 v26, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s92, s5, 16
@@ -23925,6 +23925,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s40, s92, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s41
 ; SI-NEXT:    s_or_b32 s5, s5, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 7
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -23951,15 +23952,14 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v23, s7
 ; SI-NEXT:    v_mov_b32_e32 v24, s4
 ; SI-NEXT:    v_mov_b32_e32 v25, s5
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 8
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24594,7 +24594,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
+define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25743,7 +25743,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25751,33 +25751,34 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s30, 26
+; SI-NEXT:    v_writelane_b32 v26, s31, 27
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
 ; SI-NEXT:    v_readfirstlane_b32 s9, v10
 ; SI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -25790,7 +25791,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s93, v2
 ; SI-NEXT:    v_readfirstlane_b32 s30, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
 ; SI-NEXT:    s_lshr_b32 s76, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -26032,6 +26032,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v26, 26
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -26058,34 +26059,33 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s60
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 27
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26101,37 +26101,38 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v26, s30, 0
-; VI-NEXT:    v_writelane_b32 v26, s31, 1
-; VI-NEXT:    v_writelane_b32 v26, s34, 2
-; VI-NEXT:    v_writelane_b32 v26, s35, 3
-; VI-NEXT:    v_writelane_b32 v26, s36, 4
-; VI-NEXT:    v_writelane_b32 v26, s37, 5
-; VI-NEXT:    v_writelane_b32 v26, s38, 6
-; VI-NEXT:    v_writelane_b32 v26, s39, 7
-; VI-NEXT:    v_writelane_b32 v26, s48, 8
-; VI-NEXT:    v_writelane_b32 v26, s49, 9
-; VI-NEXT:    v_writelane_b32 v26, s50, 10
-; VI-NEXT:    v_writelane_b32 v26, s51, 11
-; VI-NEXT:    v_writelane_b32 v26, s52, 12
-; VI-NEXT:    v_writelane_b32 v26, s53, 13
-; VI-NEXT:    v_writelane_b32 v26, s54, 14
-; VI-NEXT:    v_writelane_b32 v26, s55, 15
-; VI-NEXT:    v_writelane_b32 v26, s64, 16
-; VI-NEXT:    v_writelane_b32 v26, s65, 17
-; VI-NEXT:    v_writelane_b32 v26, s66, 18
-; VI-NEXT:    v_writelane_b32 v26, s67, 19
-; VI-NEXT:    v_writelane_b32 v26, s68, 20
-; VI-NEXT:    v_writelane_b32 v26, s69, 21
-; VI-NEXT:    v_writelane_b32 v26, s70, 22
-; VI-NEXT:    v_writelane_b32 v26, s71, 23
-; VI-NEXT:    v_writelane_b32 v26, s80, 24
-; VI-NEXT:    v_writelane_b32 v26, s81, 25
-; VI-NEXT:    v_writelane_b32 v26, s82, 26
-; VI-NEXT:    v_writelane_b32 v26, s83, 27
-; VI-NEXT:    v_writelane_b32 v26, s84, 28
-; VI-NEXT:    v_writelane_b32 v26, s85, 29
-; VI-NEXT:    v_writelane_b32 v26, s86, 30
+; VI-NEXT:    v_writelane_b32 v26, s34, 0
+; VI-NEXT:    v_writelane_b32 v26, s35, 1
+; VI-NEXT:    v_writelane_b32 v26, s36, 2
+; VI-NEXT:    v_writelane_b32 v26, s37, 3
+; VI-NEXT:    v_writelane_b32 v26, s38, 4
+; VI-NEXT:    v_writelane_b32 v26, s39, 5
+; VI-NEXT:    v_writelane_b32 v26, s48, 6
+; VI-NEXT:    v_writelane_b32 v26, s49, 7
+; VI-NEXT:    v_writelane_b32 v26, s50, 8
+; VI-NEXT:    v_writelane_b32 v26, s51, 9
+; VI-NEXT:    v_writelane_b32 v26, s52, 10
+; VI-NEXT:    v_writelane_b32 v26, s53, 11
+; VI-NEXT:    v_writelane_b32 v26, s54, 12
+; VI-NEXT:    v_writelane_b32 v26, s55, 13
+; VI-NEXT:    v_writelane_b32 v26, s64, 14
+; VI-NEXT:    v_writelane_b32 v26, s65, 15
+; VI-NEXT:    v_writelane_b32 v26, s66, 16
+; VI-NEXT:    v_writelane_b32 v26, s67, 17
+; VI-NEXT:    v_writelane_b32 v26, s68, 18
+; VI-NEXT:    v_writelane_b32 v26, s69, 19
+; VI-NEXT:    v_writelane_b32 v26, s70, 20
+; VI-NEXT:    v_writelane_b32 v26, s71, 21
+; VI-NEXT:    v_writelane_b32 v26, s80, 22
+; VI-NEXT:    v_writelane_b32 v26, s81, 23
+; VI-NEXT:    v_writelane_b32 v26, s82, 24
+; VI-NEXT:    v_writelane_b32 v26, s83, 25
+; VI-NEXT:    v_writelane_b32 v26, s84, 26
+; VI-NEXT:    v_writelane_b32 v26, s85, 27
+; VI-NEXT:    v_writelane_b32 v26, s86, 28
+; VI-NEXT:    v_writelane_b32 v26, s87, 29
+; VI-NEXT:    v_writelane_b32 v26, s30, 30
+; VI-NEXT:    v_writelane_b32 v26, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v10
 ; VI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -26144,7 +26145,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s31, v2
 ; VI-NEXT:    v_readfirstlane_b32 s68, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v26, s87, 31
 ; VI-NEXT:    s_lshr_b32 s76, s29, 16
 ; VI-NEXT:    s_lshr_b32 s79, s28, 16
 ; VI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -26386,6 +26386,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v26, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -26412,38 +26413,37 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
 ; VI-NEXT:    v_mov_b32_e32 v24, s60
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
-; VI-NEXT:    v_readlane_b32 s87, v26, 31
-; VI-NEXT:    v_readlane_b32 s86, v26, 30
-; VI-NEXT:    v_readlane_b32 s85, v26, 29
-; VI-NEXT:    v_readlane_b32 s84, v26, 28
-; VI-NEXT:    v_readlane_b32 s83, v26, 27
-; VI-NEXT:    v_readlane_b32 s82, v26, 26
-; VI-NEXT:    v_readlane_b32 s81, v26, 25
-; VI-NEXT:    v_readlane_b32 s80, v26, 24
-; VI-NEXT:    v_readlane_b32 s71, v26, 23
-; VI-NEXT:    v_readlane_b32 s70, v26, 22
-; VI-NEXT:    v_readlane_b32 s69, v26, 21
-; VI-NEXT:    v_readlane_b32 s68, v26, 20
-; VI-NEXT:    v_readlane_b32 s67, v26, 19
-; VI-NEXT:    v_readlane_b32 s66, v26, 18
-; VI-NEXT:    v_readlane_b32 s65, v26, 17
-; VI-NEXT:    v_readlane_b32 s64, v26, 16
-; VI-NEXT:    v_readlane_b32 s55, v26, 15
-; VI-NEXT:    v_readlane_b32 s54, v26, 14
-; VI-NEXT:    v_readlane_b32 s53, v26, 13
-; VI-NEXT:    v_readlane_b32 s52, v26, 12
-; VI-NEXT:    v_readlane_b32 s51, v26, 11
-; VI-NEXT:    v_readlane_b32 s50, v26, 10
-; VI-NEXT:    v_readlane_b32 s49, v26, 9
-; VI-NEXT:    v_readlane_b32 s48, v26, 8
-; VI-NEXT:    v_readlane_b32 s39, v26, 7
-; VI-NEXT:    v_readlane_b32 s38, v26, 6
-; VI-NEXT:    v_readlane_b32 s37, v26, 5
-; VI-NEXT:    v_readlane_b32 s36, v26, 4
-; VI-NEXT:    v_readlane_b32 s35, v26, 3
-; VI-NEXT:    v_readlane_b32 s34, v26, 2
-; VI-NEXT:    v_readlane_b32 s31, v26, 1
-; VI-NEXT:    v_readlane_b32 s30, v26, 0
+; VI-NEXT:    v_readlane_b32 s31, v26, 31
+; VI-NEXT:    v_readlane_b32 s87, v26, 29
+; VI-NEXT:    v_readlane_b32 s86, v26, 28
+; VI-NEXT:    v_readlane_b32 s85, v26, 27
+; VI-NEXT:    v_readlane_b32 s84, v26, 26
+; VI-NEXT:    v_readlane_b32 s83, v26, 25
+; VI-NEXT:    v_readlane_b32 s82, v26, 24
+; VI-NEXT:    v_readlane_b32 s81, v26, 23
+; VI-NEXT:    v_readlane_b32 s80, v26, 22
+; VI-NEXT:    v_readlane_b32 s71, v26, 21
+; VI-NEXT:    v_readlane_b32 s70, v26, 20
+; VI-NEXT:    v_readlane_b32 s69, v26, 19
+; VI-NEXT:    v_readlane_b32 s68, v26, 18
+; VI-NEXT:    v_readlane_b32 s67, v26, 17
+; VI-NEXT:    v_readlane_b32 s66, v26, 16
+; VI-NEXT:    v_readlane_b32 s65, v26, 15
+; VI-NEXT:    v_readlane_b32 s64, v26, 14
+; VI-NEXT:    v_readlane_b32 s55, v26, 13
+; VI-NEXT:    v_readlane_b32 s54, v26, 12
+; VI-NEXT:    v_readlane_b32 s53, v26, 11
+; VI-NEXT:    v_readlane_b32 s52, v26, 10
+; VI-NEXT:    v_readlane_b32 s51, v26, 9
+; VI-NEXT:    v_readlane_b32 s50, v26, 8
+; VI-NEXT:    v_readlane_b32 s49, v26, 7
+; VI-NEXT:    v_readlane_b32 s48, v26, 6
+; VI-NEXT:    v_readlane_b32 s39, v26, 5
+; VI-NEXT:    v_readlane_b32 s38, v26, 4
+; VI-NEXT:    v_readlane_b32 s37, v26, 3
+; VI-NEXT:    v_readlane_b32 s36, v26, 2
+; VI-NEXT:    v_readlane_b32 s35, v26, 1
+; VI-NEXT:    v_readlane_b32 s34, v26, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26757,7 +26757,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) {
+define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27529,7 +27529,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13i64_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27537,15 +27537,15 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s14, v12
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
+; SI-NEXT:    v_writelane_b32 v26, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s5, v11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v10
 ; SI-NEXT:    v_readfirstlane_b32 s7, v9
@@ -27559,7 +27559,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s15, v1
 ; SI-NEXT:    s_cmp_lg_u32 s14, 0
 ; SI-NEXT:    v_readfirstlane_b32 s14, v0
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
+; SI-NEXT:    v_writelane_b32 v26, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s92, s5, 16
@@ -27721,6 +27721,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s40, s92, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s41
 ; SI-NEXT:    s_or_b32 s5, s5, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 7
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -27747,15 +27748,14 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v23, s7
 ; SI-NEXT:    v_mov_b32_e32 v24, s4
 ; SI-NEXT:    v_mov_b32_e32 v25, s5
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 8
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28390,7 +28390,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
+define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v13i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29660,7 +29660,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v13i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29668,33 +29668,33 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s30, 26
 ; SI-NEXT:    v_readfirstlane_b32 s6, v11
 ; SI-NEXT:    v_readfirstlane_b32 s8, v10
 ; SI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -29707,7 +29707,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
+; SI-NEXT:    v_writelane_b32 v32, s31, 27
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
@@ -30064,34 +30064,34 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 26
+; SI-NEXT:    v_readlane_b32 s31, v32, 27
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30104,37 +30104,38 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s6, v11
 ; VI-NEXT:    v_readfirstlane_b32 s8, v10
 ; VI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -30147,7 +30148,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s30, v2
 ; VI-NEXT:    v_readfirstlane_b32 s35, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s74, s29, 16
 ; VI-NEXT:    s_lshr_b32 s77, s28, 16
 ; VI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -30401,38 +30401,38 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30744,7 +30744,7 @@ end:
   ret <13 x i64> %phi
 }
 
-define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) {
+define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31437,7 +31437,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31445,15 +31445,15 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v45, s30, 0
-; SI-NEXT:    v_writelane_b32 v45, s31, 1
-; SI-NEXT:    v_writelane_b32 v45, s34, 2
-; SI-NEXT:    v_writelane_b32 v45, s35, 3
-; SI-NEXT:    v_writelane_b32 v45, s36, 4
-; SI-NEXT:    v_writelane_b32 v45, s37, 5
-; SI-NEXT:    v_writelane_b32 v45, s38, 6
+; SI-NEXT:    v_writelane_b32 v45, s34, 0
+; SI-NEXT:    v_writelane_b32 v45, s35, 1
+; SI-NEXT:    v_writelane_b32 v45, s36, 2
+; SI-NEXT:    v_writelane_b32 v45, s37, 3
+; SI-NEXT:    v_writelane_b32 v45, s38, 4
+; SI-NEXT:    v_writelane_b32 v45, s39, 5
+; SI-NEXT:    v_writelane_b32 v45, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
-; SI-NEXT:    v_writelane_b32 v45, s39, 7
+; SI-NEXT:    v_writelane_b32 v45, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s13, v11
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
 ; SI-NEXT:    v_readfirstlane_b32 s15, v9
@@ -31472,7 +31472,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v45, s48, 8
+; SI-NEXT:    v_writelane_b32 v45, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s13, 16
@@ -31705,6 +31705,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v48
+; SI-NEXT:    v_readlane_b32 s30, v45, 7
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v38
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v37
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v36
@@ -31718,15 +31719,14 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v28
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_readlane_b32 s48, v45, 8
-; SI-NEXT:    v_readlane_b32 s39, v45, 7
-; SI-NEXT:    v_readlane_b32 s38, v45, 6
-; SI-NEXT:    v_readlane_b32 s37, v45, 5
-; SI-NEXT:    v_readlane_b32 s36, v45, 4
-; SI-NEXT:    v_readlane_b32 s35, v45, 3
-; SI-NEXT:    v_readlane_b32 s34, v45, 2
-; SI-NEXT:    v_readlane_b32 s31, v45, 1
-; SI-NEXT:    v_readlane_b32 s30, v45, 0
+; SI-NEXT:    v_readlane_b32 s31, v45, 8
+; SI-NEXT:    v_readlane_b32 s48, v45, 6
+; SI-NEXT:    v_readlane_b32 s39, v45, 5
+; SI-NEXT:    v_readlane_b32 s38, v45, 4
+; SI-NEXT:    v_readlane_b32 s37, v45, 3
+; SI-NEXT:    v_readlane_b32 s36, v45, 2
+; SI-NEXT:    v_readlane_b32 s35, v45, 1
+; SI-NEXT:    v_readlane_b32 s34, v45, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32578,7 +32578,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
+define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33727,7 +33727,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33735,33 +33735,34 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s30, 26
+; SI-NEXT:    v_writelane_b32 v26, s31, 27
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
 ; SI-NEXT:    v_readfirstlane_b32 s9, v10
 ; SI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -33774,7 +33775,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s93, v2
 ; SI-NEXT:    v_readfirstlane_b32 s30, v1
 ; SI-NEXT:    v_readfirstlane_b32 s35, v0
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
 ; SI-NEXT:    s_lshr_b32 s76, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -34016,6 +34016,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v26, 26
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -34042,34 +34043,33 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s60
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 27
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -34085,37 +34085,38 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v26, s30, 0
-; VI-NEXT:    v_writelane_b32 v26, s31, 1
-; VI-NEXT:    v_writelane_b32 v26, s34, 2
-; VI-NEXT:    v_writelane_b32 v26, s35, 3
-; VI-NEXT:    v_writelane_b32 v26, s36, 4
-; VI-NEXT:    v_writelane_b32 v26, s37, 5
-; VI-NEXT:    v_writelane_b32 v26, s38, 6
-; VI-NEXT:    v_writelane_b32 v26, s39, 7
-; VI-NEXT:    v_writelane_b32 v26, s48, 8
-; VI-NEXT:    v_writelane_b32 v26, s49, 9
-; VI-NEXT:    v_writelane_b32 v26, s50, 10
-; VI-NEXT:    v_writelane_b32 v26, s51, 11
-; VI-NEXT:    v_writelane_b32 v26, s52, 12
-; VI-NEXT:    v_writelane_b32 v26, s53, 13
-; VI-NEXT:    v_writelane_b32 v26, s54, 14
-; VI-NEXT:    v_writelane_b32 v26, s55, 15
-; VI-NEXT:    v_writelane_b32 v26, s64, 16
-; VI-NEXT:    v_writelane_b32 v26, s65, 17
-; VI-NEXT:    v_writelane_b32 v26, s66, 18
-; VI-NEXT:    v_writelane_b32 v26, s67, 19
-; VI-NEXT:    v_writelane_b32 v26, s68, 20
-; VI-NEXT:    v_writelane_b32 v26, s69, 21
-; VI-NEXT:    v_writelane_b32 v26, s70, 22
-; VI-NEXT:    v_writelane_b32 v26, s71, 23
-; VI-NEXT:    v_writelane_b32 v26, s80, 24
-; VI-NEXT:    v_writelane_b32 v26, s81, 25
-; VI-NEXT:    v_writelane_b32 v26, s82, 26
-; VI-NEXT:    v_writelane_b32 v26, s83, 27
-; VI-NEXT:    v_writelane_b32 v26, s84, 28
-; VI-NEXT:    v_writelane_b32 v26, s85, 29
-; VI-NEXT:    v_writelane_b32 v26, s86, 30
+; VI-NEXT:    v_writelane_b32 v26, s34, 0
+; VI-NEXT:    v_writelane_b32 v26, s35, 1
+; VI-NEXT:    v_writelane_b32 v26, s36, 2
+; VI-NEXT:    v_writelane_b32 v26, s37, 3
+; VI-NEXT:    v_writelane_b32 v26, s38, 4
+; VI-NEXT:    v_writelane_b32 v26, s39, 5
+; VI-NEXT:    v_writelane_b32 v26, s48, 6
+; VI-NEXT:    v_writelane_b32 v26, s49, 7
+; VI-NEXT:    v_writelane_b32 v26, s50, 8
+; VI-NEXT:    v_writelane_b32 v26, s51, 9
+; VI-NEXT:    v_writelane_b32 v26, s52, 10
+; VI-NEXT:    v_writelane_b32 v26, s53, 11
+; VI-NEXT:    v_writelane_b32 v26, s54, 12
+; VI-NEXT:    v_writelane_b32 v26, s55, 13
+; VI-NEXT:    v_writelane_b32 v26, s64, 14
+; VI-NEXT:    v_writelane_b32 v26, s65, 15
+; VI-NEXT:    v_writelane_b32 v26, s66, 16
+; VI-NEXT:    v_writelane_b32 v26, s67, 17
+; VI-NEXT:    v_writelane_b32 v26, s68, 18
+; VI-NEXT:    v_writelane_b32 v26, s69, 19
+; VI-NEXT:    v_writelane_b32 v26, s70, 20
+; VI-NEXT:    v_writelane_b32 v26, s71, 21
+; VI-NEXT:    v_writelane_b32 v26, s80, 22
+; VI-NEXT:    v_writelane_b32 v26, s81, 23
+; VI-NEXT:    v_writelane_b32 v26, s82, 24
+; VI-NEXT:    v_writelane_b32 v26, s83, 25
+; VI-NEXT:    v_writelane_b32 v26, s84, 26
+; VI-NEXT:    v_writelane_b32 v26, s85, 27
+; VI-NEXT:    v_writelane_b32 v26, s86, 28
+; VI-NEXT:    v_writelane_b32 v26, s87, 29
+; VI-NEXT:    v_writelane_b32 v26, s30, 30
+; VI-NEXT:    v_writelane_b32 v26, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v10
 ; VI-NEXT:    v_readfirstlane_b32 s11, v9
@@ -34128,7 +34129,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s31, v2
 ; VI-NEXT:    v_readfirstlane_b32 s68, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v26, s87, 31
 ; VI-NEXT:    s_lshr_b32 s76, s29, 16
 ; VI-NEXT:    s_lshr_b32 s79, s28, 16
 ; VI-NEXT:    s_lshr_b32 s89, s27, 16
@@ -34370,6 +34370,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s61, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v26, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -34396,38 +34397,37 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v23, s59
 ; VI-NEXT:    v_mov_b32_e32 v24, s60
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
-; VI-NEXT:    v_readlane_b32 s87, v26, 31
-; VI-NEXT:    v_readlane_b32 s86, v26, 30
-; VI-NEXT:    v_readlane_b32 s85, v26, 29
-; VI-NEXT:    v_readlane_b32 s84, v26, 28
-; VI-NEXT:    v_readlane_b32 s83, v26, 27
-; VI-NEXT:    v_readlane_b32 s82, v26, 26
-; VI-NEXT:    v_readlane_b32 s81, v26, 25
-; VI-NEXT:    v_readlane_b32 s80, v26, 24
-; VI-NEXT:    v_readlane_b32 s71, v26, 23
-; VI-NEXT:    v_readlane_b32 s70, v26, 22
-; VI-NEXT:    v_readlane_b32 s69, v26, 21
-; VI-NEXT:    v_readlane_b32 s68, v26, 20
-; VI-NEXT:    v_readlane_b32 s67, v26, 19
-; VI-NEXT:    v_readlane_b32 s66, v26, 18
-; VI-NEXT:    v_readlane_b32 s65, v26, 17
-; VI-NEXT:    v_readlane_b32 s64, v26, 16
-; VI-NEXT:    v_readlane_b32 s55, v26, 15
-; VI-NEXT:    v_readlane_b32 s54, v26, 14
-; VI-NEXT:    v_readlane_b32 s53, v26, 13
-; VI-NEXT:    v_readlane_b32 s52, v26, 12
-; VI-NEXT:    v_readlane_b32 s51, v26, 11
-; VI-NEXT:    v_readlane_b32 s50, v26, 10
-; VI-NEXT:    v_readlane_b32 s49, v26, 9
-; VI-NEXT:    v_readlane_b32 s48, v26, 8
-; VI-NEXT:    v_readlane_b32 s39, v26, 7
-; VI-NEXT:    v_readlane_b32 s38, v26, 6
-; VI-NEXT:    v_readlane_b32 s37, v26, 5
-; VI-NEXT:    v_readlane_b32 s36, v26, 4
-; VI-NEXT:    v_readlane_b32 s35, v26, 3
-; VI-NEXT:    v_readlane_b32 s34, v26, 2
-; VI-NEXT:    v_readlane_b32 s31, v26, 1
-; VI-NEXT:    v_readlane_b32 s30, v26, 0
+; VI-NEXT:    v_readlane_b32 s31, v26, 31
+; VI-NEXT:    v_readlane_b32 s87, v26, 29
+; VI-NEXT:    v_readlane_b32 s86, v26, 28
+; VI-NEXT:    v_readlane_b32 s85, v26, 27
+; VI-NEXT:    v_readlane_b32 s84, v26, 26
+; VI-NEXT:    v_readlane_b32 s83, v26, 25
+; VI-NEXT:    v_readlane_b32 s82, v26, 24
+; VI-NEXT:    v_readlane_b32 s81, v26, 23
+; VI-NEXT:    v_readlane_b32 s80, v26, 22
+; VI-NEXT:    v_readlane_b32 s71, v26, 21
+; VI-NEXT:    v_readlane_b32 s70, v26, 20
+; VI-NEXT:    v_readlane_b32 s69, v26, 19
+; VI-NEXT:    v_readlane_b32 s68, v26, 18
+; VI-NEXT:    v_readlane_b32 s67, v26, 17
+; VI-NEXT:    v_readlane_b32 s66, v26, 16
+; VI-NEXT:    v_readlane_b32 s65, v26, 15
+; VI-NEXT:    v_readlane_b32 s64, v26, 14
+; VI-NEXT:    v_readlane_b32 s55, v26, 13
+; VI-NEXT:    v_readlane_b32 s54, v26, 12
+; VI-NEXT:    v_readlane_b32 s53, v26, 11
+; VI-NEXT:    v_readlane_b32 s52, v26, 10
+; VI-NEXT:    v_readlane_b32 s51, v26, 9
+; VI-NEXT:    v_readlane_b32 s50, v26, 8
+; VI-NEXT:    v_readlane_b32 s49, v26, 7
+; VI-NEXT:    v_readlane_b32 s48, v26, 6
+; VI-NEXT:    v_readlane_b32 s39, v26, 5
+; VI-NEXT:    v_readlane_b32 s38, v26, 4
+; VI-NEXT:    v_readlane_b32 s37, v26, 3
+; VI-NEXT:    v_readlane_b32 s36, v26, 2
+; VI-NEXT:    v_readlane_b32 s35, v26, 1
+; VI-NEXT:    v_readlane_b32 s34, v26, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -34741,7 +34741,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
+define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35434,7 +35434,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v13f64_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35442,15 +35442,15 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v45, s30, 0
-; SI-NEXT:    v_writelane_b32 v45, s31, 1
-; SI-NEXT:    v_writelane_b32 v45, s34, 2
-; SI-NEXT:    v_writelane_b32 v45, s35, 3
-; SI-NEXT:    v_writelane_b32 v45, s36, 4
-; SI-NEXT:    v_writelane_b32 v45, s37, 5
-; SI-NEXT:    v_writelane_b32 v45, s38, 6
+; SI-NEXT:    v_writelane_b32 v45, s34, 0
+; SI-NEXT:    v_writelane_b32 v45, s35, 1
+; SI-NEXT:    v_writelane_b32 v45, s36, 2
+; SI-NEXT:    v_writelane_b32 v45, s37, 3
+; SI-NEXT:    v_writelane_b32 v45, s38, 4
+; SI-NEXT:    v_writelane_b32 v45, s39, 5
+; SI-NEXT:    v_writelane_b32 v45, s48, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
-; SI-NEXT:    v_writelane_b32 v45, s39, 7
+; SI-NEXT:    v_writelane_b32 v45, s30, 7
 ; SI-NEXT:    v_readfirstlane_b32 s13, v11
 ; SI-NEXT:    v_readfirstlane_b32 s12, v10
 ; SI-NEXT:    v_readfirstlane_b32 s15, v9
@@ -35469,7 +35469,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v45, s48, 8
+; SI-NEXT:    v_writelane_b32 v45, s31, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s48, s13, 16
@@ -35702,6 +35702,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v48
+; SI-NEXT:    v_readlane_b32 s30, v45, 7
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v38
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v37
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v36
@@ -35715,15 +35716,14 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v28
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_readlane_b32 s48, v45, 8
-; SI-NEXT:    v_readlane_b32 s39, v45, 7
-; SI-NEXT:    v_readlane_b32 s38, v45, 6
-; SI-NEXT:    v_readlane_b32 s37, v45, 5
-; SI-NEXT:    v_readlane_b32 s36, v45, 4
-; SI-NEXT:    v_readlane_b32 s35, v45, 3
-; SI-NEXT:    v_readlane_b32 s34, v45, 2
-; SI-NEXT:    v_readlane_b32 s31, v45, 1
-; SI-NEXT:    v_readlane_b32 s30, v45, 0
+; SI-NEXT:    v_readlane_b32 s31, v45, 8
+; SI-NEXT:    v_readlane_b32 s48, v45, 6
+; SI-NEXT:    v_readlane_b32 s39, v45, 5
+; SI-NEXT:    v_readlane_b32 s38, v45, 4
+; SI-NEXT:    v_readlane_b32 s37, v45, 3
+; SI-NEXT:    v_readlane_b32 s36, v45, 2
+; SI-NEXT:    v_readlane_b32 s35, v45, 1
+; SI-NEXT:    v_readlane_b32 s34, v45, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -36575,7 +36575,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
+define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v13f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37845,7 +37845,7 @@ end:
   ret <13 x double> %phi
 }
 
-define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v13f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37853,33 +37853,33 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s30, 26
 ; SI-NEXT:    v_readfirstlane_b32 s6, v11
 ; SI-NEXT:    v_readfirstlane_b32 s8, v10
 ; SI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -37892,7 +37892,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s88, v2
 ; SI-NEXT:    v_readfirstlane_b32 s91, v1
 ; SI-NEXT:    v_readfirstlane_b32 s94, v0
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
+; SI-NEXT:    v_writelane_b32 v32, s31, 27
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s93, s28, 16
 ; SI-NEXT:    s_lshr_b32 s30, s27, 16
@@ -38249,34 +38249,34 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 26
+; SI-NEXT:    v_readlane_b32 s31, v32, 27
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -38289,37 +38289,38 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s6, v11
 ; VI-NEXT:    v_readfirstlane_b32 s8, v10
 ; VI-NEXT:    v_readfirstlane_b32 s10, v9
@@ -38332,7 +38333,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s30, v2
 ; VI-NEXT:    v_readfirstlane_b32 s35, v1
 ; VI-NEXT:    v_readfirstlane_b32 s71, v0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s74, s29, 16
 ; VI-NEXT:    s_lshr_b32 s77, s28, 16
 ; VI-NEXT:    s_lshr_b32 s88, s27, 16
@@ -38586,38 +38586,38 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -38929,7 +38929,7 @@ end:
   ret <13 x double> %phi
 }
 
-define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
+define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v52f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40155,7 +40155,7 @@ end:
   ret <52 x half> %phi
 }
 
-define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52i16_to_v52f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40163,42 +40163,41 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v26, s30, 0
-; SI-NEXT:    v_writelane_b32 v26, s31, 1
-; SI-NEXT:    v_writelane_b32 v26, s34, 2
-; SI-NEXT:    v_writelane_b32 v26, s35, 3
-; SI-NEXT:    v_writelane_b32 v26, s36, 4
-; SI-NEXT:    v_writelane_b32 v26, s37, 5
-; SI-NEXT:    v_writelane_b32 v26, s38, 6
-; SI-NEXT:    v_writelane_b32 v26, s39, 7
-; SI-NEXT:    v_writelane_b32 v26, s48, 8
-; SI-NEXT:    v_writelane_b32 v26, s49, 9
-; SI-NEXT:    v_writelane_b32 v26, s50, 10
-; SI-NEXT:    v_writelane_b32 v26, s51, 11
-; SI-NEXT:    v_writelane_b32 v26, s52, 12
-; SI-NEXT:    v_writelane_b32 v26, s53, 13
-; SI-NEXT:    v_writelane_b32 v26, s54, 14
-; SI-NEXT:    v_writelane_b32 v26, s55, 15
-; SI-NEXT:    v_writelane_b32 v26, s64, 16
-; SI-NEXT:    v_writelane_b32 v26, s65, 17
-; SI-NEXT:    v_writelane_b32 v26, s66, 18
-; SI-NEXT:    v_writelane_b32 v26, s67, 19
-; SI-NEXT:    v_writelane_b32 v26, s68, 20
-; SI-NEXT:    v_writelane_b32 v26, s69, 21
-; SI-NEXT:    v_writelane_b32 v26, s70, 22
-; SI-NEXT:    v_writelane_b32 v26, s71, 23
-; SI-NEXT:    v_writelane_b32 v26, s80, 24
-; SI-NEXT:    v_writelane_b32 v26, s81, 25
-; SI-NEXT:    v_writelane_b32 v26, s82, 26
-; SI-NEXT:    v_writelane_b32 v26, s83, 27
-; SI-NEXT:    v_writelane_b32 v26, s84, 28
-; SI-NEXT:    v_writelane_b32 v26, s85, 29
-; SI-NEXT:    v_writelane_b32 v26, s86, 30
-; SI-NEXT:    v_writelane_b32 v26, s87, 31
-; SI-NEXT:    v_writelane_b32 v26, s96, 32
-; SI-NEXT:    v_writelane_b32 v26, s97, 33
-; SI-NEXT:    v_writelane_b32 v26, s98, 34
-; SI-NEXT:    v_writelane_b32 v26, s99, 35
+; SI-NEXT:    v_writelane_b32 v26, s34, 0
+; SI-NEXT:    v_writelane_b32 v26, s35, 1
+; SI-NEXT:    v_writelane_b32 v26, s36, 2
+; SI-NEXT:    v_writelane_b32 v26, s37, 3
+; SI-NEXT:    v_writelane_b32 v26, s38, 4
+; SI-NEXT:    v_writelane_b32 v26, s39, 5
+; SI-NEXT:    v_writelane_b32 v26, s48, 6
+; SI-NEXT:    v_writelane_b32 v26, s49, 7
+; SI-NEXT:    v_writelane_b32 v26, s50, 8
+; SI-NEXT:    v_writelane_b32 v26, s51, 9
+; SI-NEXT:    v_writelane_b32 v26, s52, 10
+; SI-NEXT:    v_writelane_b32 v26, s53, 11
+; SI-NEXT:    v_writelane_b32 v26, s54, 12
+; SI-NEXT:    v_writelane_b32 v26, s55, 13
+; SI-NEXT:    v_writelane_b32 v26, s64, 14
+; SI-NEXT:    v_writelane_b32 v26, s65, 15
+; SI-NEXT:    v_writelane_b32 v26, s66, 16
+; SI-NEXT:    v_writelane_b32 v26, s67, 17
+; SI-NEXT:    v_writelane_b32 v26, s68, 18
+; SI-NEXT:    v_writelane_b32 v26, s69, 19
+; SI-NEXT:    v_writelane_b32 v26, s70, 20
+; SI-NEXT:    v_writelane_b32 v26, s71, 21
+; SI-NEXT:    v_writelane_b32 v26, s80, 22
+; SI-NEXT:    v_writelane_b32 v26, s81, 23
+; SI-NEXT:    v_writelane_b32 v26, s82, 24
+; SI-NEXT:    v_writelane_b32 v26, s83, 25
+; SI-NEXT:    v_writelane_b32 v26, s84, 26
+; SI-NEXT:    v_writelane_b32 v26, s85, 27
+; SI-NEXT:    v_writelane_b32 v26, s86, 28
+; SI-NEXT:    v_writelane_b32 v26, s87, 29
+; SI-NEXT:    v_writelane_b32 v26, s96, 30
+; SI-NEXT:    v_writelane_b32 v26, s97, 31
+; SI-NEXT:    v_writelane_b32 v26, s98, 32
+; SI-NEXT:    v_writelane_b32 v26, s99, 33
+; SI-NEXT:    v_writelane_b32 v26, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s85, v11
 ; SI-NEXT:    v_readfirstlane_b32 s99, v10
 ; SI-NEXT:    v_readfirstlane_b32 s81, v9
@@ -40211,6 +40210,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s97, v2
 ; SI-NEXT:    v_readfirstlane_b32 s84, v1
 ; SI-NEXT:    v_readfirstlane_b32 s86, v0
+; SI-NEXT:    v_writelane_b32 v26, s31, 35
 ; SI-NEXT:    s_lshr_b32 s54, s29, 16
 ; SI-NEXT:    s_lshr_b32 s91, s28, 16
 ; SI-NEXT:    s_lshr_b32 s53, s27, 16
@@ -40602,6 +40602,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s29, s41, 0xffff
 ; SI-NEXT:    s_lshl_b32 s40, s68, 16
 ; SI-NEXT:    s_or_b32 s29, s29, s40
+; SI-NEXT:    v_readlane_b32 s30, v26, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s14
 ; SI-NEXT:    v_mov_b32_e32 v1, s15
 ; SI-NEXT:    v_mov_b32_e32 v2, s12
@@ -40628,42 +40629,41 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v23, s27
 ; SI-NEXT:    v_mov_b32_e32 v24, s28
 ; SI-NEXT:    v_mov_b32_e32 v25, s29
-; SI-NEXT:    v_readlane_b32 s99, v26, 35
-; SI-NEXT:    v_readlane_b32 s98, v26, 34
-; SI-NEXT:    v_readlane_b32 s97, v26, 33
-; SI-NEXT:    v_readlane_b32 s96, v26, 32
-; SI-NEXT:    v_readlane_b32 s87, v26, 31
-; SI-NEXT:    v_readlane_b32 s86, v26, 30
-; SI-NEXT:    v_readlane_b32 s85, v26, 29
-; SI-NEXT:    v_readlane_b32 s84, v26, 28
-; SI-NEXT:    v_readlane_b32 s83, v26, 27
-; SI-NEXT:    v_readlane_b32 s82, v26, 26
-; SI-NEXT:    v_readlane_b32 s81, v26, 25
-; SI-NEXT:    v_readlane_b32 s80, v26, 24
-; SI-NEXT:    v_readlane_b32 s71, v26, 23
-; SI-NEXT:    v_readlane_b32 s70, v26, 22
-; SI-NEXT:    v_readlane_b32 s69, v26, 21
-; SI-NEXT:    v_readlane_b32 s68, v26, 20
-; SI-NEXT:    v_readlane_b32 s67, v26, 19
-; SI-NEXT:    v_readlane_b32 s66, v26, 18
-; SI-NEXT:    v_readlane_b32 s65, v26, 17
-; SI-NEXT:    v_readlane_b32 s64, v26, 16
-; SI-NEXT:    v_readlane_b32 s55, v26, 15
-; SI-NEXT:    v_readlane_b32 s54, v26, 14
-; SI-NEXT:    v_readlane_b32 s53, v26, 13
-; SI-NEXT:    v_readlane_b32 s52, v26, 12
-; SI-NEXT:    v_readlane_b32 s51, v26, 11
-; SI-NEXT:    v_readlane_b32 s50, v26, 10
-; SI-NEXT:    v_readlane_b32 s49, v26, 9
-; SI-NEXT:    v_readlane_b32 s48, v26, 8
-; SI-NEXT:    v_readlane_b32 s39, v26, 7
-; SI-NEXT:    v_readlane_b32 s38, v26, 6
-; SI-NEXT:    v_readlane_b32 s37, v26, 5
-; SI-NEXT:    v_readlane_b32 s36, v26, 4
-; SI-NEXT:    v_readlane_b32 s35, v26, 3
-; SI-NEXT:    v_readlane_b32 s34, v26, 2
-; SI-NEXT:    v_readlane_b32 s31, v26, 1
-; SI-NEXT:    v_readlane_b32 s30, v26, 0
+; SI-NEXT:    v_readlane_b32 s31, v26, 35
+; SI-NEXT:    v_readlane_b32 s99, v26, 33
+; SI-NEXT:    v_readlane_b32 s98, v26, 32
+; SI-NEXT:    v_readlane_b32 s97, v26, 31
+; SI-NEXT:    v_readlane_b32 s96, v26, 30
+; SI-NEXT:    v_readlane_b32 s87, v26, 29
+; SI-NEXT:    v_readlane_b32 s86, v26, 28
+; SI-NEXT:    v_readlane_b32 s85, v26, 27
+; SI-NEXT:    v_readlane_b32 s84, v26, 26
+; SI-NEXT:    v_readlane_b32 s83, v26, 25
+; SI-NEXT:    v_readlane_b32 s82, v26, 24
+; SI-NEXT:    v_readlane_b32 s81, v26, 23
+; SI-NEXT:    v_readlane_b32 s80, v26, 22
+; SI-NEXT:    v_readlane_b32 s71, v26, 21
+; SI-NEXT:    v_readlane_b32 s70, v26, 20
+; SI-NEXT:    v_readlane_b32 s69, v26, 19
+; SI-NEXT:    v_readlane_b32 s68, v26, 18
+; SI-NEXT:    v_readlane_b32 s67, v26, 17
+; SI-NEXT:    v_readlane_b32 s66, v26, 16
+; SI-NEXT:    v_readlane_b32 s65, v26, 15
+; SI-NEXT:    v_readlane_b32 s64, v26, 14
+; SI-NEXT:    v_readlane_b32 s55, v26, 13
+; SI-NEXT:    v_readlane_b32 s54, v26, 12
+; SI-NEXT:    v_readlane_b32 s53, v26, 11
+; SI-NEXT:    v_readlane_b32 s52, v26, 10
+; SI-NEXT:    v_readlane_b32 s51, v26, 9
+; SI-NEXT:    v_readlane_b32 s50, v26, 8
+; SI-NEXT:    v_readlane_b32 s49, v26, 7
+; SI-NEXT:    v_readlane_b32 s48, v26, 6
+; SI-NEXT:    v_readlane_b32 s39, v26, 5
+; SI-NEXT:    v_readlane_b32 s38, v26, 4
+; SI-NEXT:    v_readlane_b32 s37, v26, 3
+; SI-NEXT:    v_readlane_b32 s36, v26, 2
+; SI-NEXT:    v_readlane_b32 s35, v26, 1
+; SI-NEXT:    v_readlane_b32 s34, v26, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -41559,7 +41559,7 @@ end:
   ret <52 x half> %phi
 }
 
-define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
+define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v52i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42433,7 +42433,7 @@ end:
   ret <52 x i16> %phi
 }
 
-define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43769,3 +43769,5 @@ end:
   %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <52 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index a24315e66392c..17370fc4b8480 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) {
+define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -188,7 +188,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -526,7 +526,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) {
+define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -694,7 +694,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1238,7 +1238,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) {
+define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1420,7 +1420,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1758,7 +1758,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) {
+define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1947,7 +1947,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2285,7 +2285,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) {
+define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2467,7 +2467,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2805,7 +2805,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) {
+define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2931,7 +2931,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3419,7 +3419,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
+define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4255,7 +4255,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4263,20 +4263,20 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
+; SI-NEXT:    v_writelane_b32 v28, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -4292,7 +4292,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s41, v1
 ; SI-NEXT:    s_cmp_lg_u32 s40, 0
 ; SI-NEXT:    v_readfirstlane_b32 s40, v0
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
+; SI-NEXT:    v_writelane_b32 v28, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s30, s5, 16
@@ -4466,6 +4466,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s42, s30, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s43
 ; SI-NEXT:    s_or_b32 s5, s5, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 12
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -4494,20 +4495,19 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v25, s7
 ; SI-NEXT:    v_mov_b32_e32 v26, s4
 ; SI-NEXT:    v_mov_b32_e32 v27, s5
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 13
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4550,10 +4550,10 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -4569,7 +4569,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s42, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s43, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -4743,6 +4743,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s42, s44, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s42
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -4771,10 +4772,9 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v25, s8
 ; VI-NEXT:    v_mov_b32_e32 v26, s7
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5204,7 +5204,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
+define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6449,7 +6449,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6457,37 +6457,38 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s30, 30
+; SI-NEXT:    v_writelane_b32 v28, s31, 31
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
 ; SI-NEXT:    v_readfirstlane_b32 s9, v12
 ; SI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -6502,7 +6503,6 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s31, v2
 ; SI-NEXT:    v_readfirstlane_b32 s69, v1
 ; SI-NEXT:    v_readfirstlane_b32 s80, v0
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -6762,6 +6762,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v28, 30
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -6790,38 +6791,37 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
 ; SI-NEXT:    v_mov_b32_e32 v26, s62
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 31
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -6838,38 +6838,37 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
-; VI-NEXT:    v_writelane_b32 v28, s36, 4
-; VI-NEXT:    v_writelane_b32 v28, s37, 5
-; VI-NEXT:    v_writelane_b32 v28, s38, 6
-; VI-NEXT:    v_writelane_b32 v28, s39, 7
-; VI-NEXT:    v_writelane_b32 v28, s48, 8
-; VI-NEXT:    v_writelane_b32 v28, s49, 9
-; VI-NEXT:    v_writelane_b32 v28, s50, 10
-; VI-NEXT:    v_writelane_b32 v28, s51, 11
-; VI-NEXT:    v_writelane_b32 v28, s52, 12
-; VI-NEXT:    v_writelane_b32 v28, s53, 13
-; VI-NEXT:    v_writelane_b32 v28, s54, 14
-; VI-NEXT:    v_writelane_b32 v28, s55, 15
-; VI-NEXT:    v_writelane_b32 v28, s64, 16
-; VI-NEXT:    v_writelane_b32 v28, s65, 17
-; VI-NEXT:    v_writelane_b32 v28, s66, 18
-; VI-NEXT:    v_writelane_b32 v28, s67, 19
-; VI-NEXT:    v_writelane_b32 v28, s68, 20
-; VI-NEXT:    v_writelane_b32 v28, s69, 21
-; VI-NEXT:    v_writelane_b32 v28, s70, 22
-; VI-NEXT:    v_writelane_b32 v28, s71, 23
-; VI-NEXT:    v_writelane_b32 v28, s80, 24
-; VI-NEXT:    v_writelane_b32 v28, s81, 25
-; VI-NEXT:    v_writelane_b32 v28, s82, 26
-; VI-NEXT:    v_writelane_b32 v28, s83, 27
-; VI-NEXT:    v_writelane_b32 v28, s84, 28
-; VI-NEXT:    v_writelane_b32 v28, s85, 29
-; VI-NEXT:    v_writelane_b32 v28, s86, 30
-; VI-NEXT:    v_writelane_b32 v28, s87, 31
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s36, 2
+; VI-NEXT:    v_writelane_b32 v28, s37, 3
+; VI-NEXT:    v_writelane_b32 v28, s38, 4
+; VI-NEXT:    v_writelane_b32 v28, s39, 5
+; VI-NEXT:    v_writelane_b32 v28, s48, 6
+; VI-NEXT:    v_writelane_b32 v28, s49, 7
+; VI-NEXT:    v_writelane_b32 v28, s50, 8
+; VI-NEXT:    v_writelane_b32 v28, s51, 9
+; VI-NEXT:    v_writelane_b32 v28, s52, 10
+; VI-NEXT:    v_writelane_b32 v28, s53, 11
+; VI-NEXT:    v_writelane_b32 v28, s54, 12
+; VI-NEXT:    v_writelane_b32 v28, s55, 13
+; VI-NEXT:    v_writelane_b32 v28, s64, 14
+; VI-NEXT:    v_writelane_b32 v28, s65, 15
+; VI-NEXT:    v_writelane_b32 v28, s66, 16
+; VI-NEXT:    v_writelane_b32 v28, s67, 17
+; VI-NEXT:    v_writelane_b32 v28, s68, 18
+; VI-NEXT:    v_writelane_b32 v28, s69, 19
+; VI-NEXT:    v_writelane_b32 v28, s70, 20
+; VI-NEXT:    v_writelane_b32 v28, s71, 21
+; VI-NEXT:    v_writelane_b32 v28, s80, 22
+; VI-NEXT:    v_writelane_b32 v28, s81, 23
+; VI-NEXT:    v_writelane_b32 v28, s82, 24
+; VI-NEXT:    v_writelane_b32 v28, s83, 25
+; VI-NEXT:    v_writelane_b32 v28, s84, 26
+; VI-NEXT:    v_writelane_b32 v28, s85, 27
+; VI-NEXT:    v_writelane_b32 v28, s86, 28
+; VI-NEXT:    v_writelane_b32 v28, s87, 29
+; VI-NEXT:    v_writelane_b32 v28, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s86, v13
 ; VI-NEXT:    v_readfirstlane_b32 s6, v12
 ; VI-NEXT:    v_readfirstlane_b32 s9, v11
@@ -6884,6 +6883,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s69, v2
 ; VI-NEXT:    v_readfirstlane_b32 s81, v1
 ; VI-NEXT:    v_readfirstlane_b32 s84, v0
+; VI-NEXT:    v_writelane_b32 v28, s31, 31
 ; VI-NEXT:    s_lshr_b32 s79, s29, 16
 ; VI-NEXT:    s_lshr_b32 s90, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -7143,6 +7143,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v28, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -7171,38 +7172,37 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
 ; VI-NEXT:    v_mov_b32_e32 v26, s62
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
-; VI-NEXT:    v_readlane_b32 s87, v28, 31
-; VI-NEXT:    v_readlane_b32 s86, v28, 30
-; VI-NEXT:    v_readlane_b32 s85, v28, 29
-; VI-NEXT:    v_readlane_b32 s84, v28, 28
-; VI-NEXT:    v_readlane_b32 s83, v28, 27
-; VI-NEXT:    v_readlane_b32 s82, v28, 26
-; VI-NEXT:    v_readlane_b32 s81, v28, 25
-; VI-NEXT:    v_readlane_b32 s80, v28, 24
-; VI-NEXT:    v_readlane_b32 s71, v28, 23
-; VI-NEXT:    v_readlane_b32 s70, v28, 22
-; VI-NEXT:    v_readlane_b32 s69, v28, 21
-; VI-NEXT:    v_readlane_b32 s68, v28, 20
-; VI-NEXT:    v_readlane_b32 s67, v28, 19
-; VI-NEXT:    v_readlane_b32 s66, v28, 18
-; VI-NEXT:    v_readlane_b32 s65, v28, 17
-; VI-NEXT:    v_readlane_b32 s64, v28, 16
-; VI-NEXT:    v_readlane_b32 s55, v28, 15
-; VI-NEXT:    v_readlane_b32 s54, v28, 14
-; VI-NEXT:    v_readlane_b32 s53, v28, 13
-; VI-NEXT:    v_readlane_b32 s52, v28, 12
-; VI-NEXT:    v_readlane_b32 s51, v28, 11
-; VI-NEXT:    v_readlane_b32 s50, v28, 10
-; VI-NEXT:    v_readlane_b32 s49, v28, 9
-; VI-NEXT:    v_readlane_b32 s48, v28, 8
-; VI-NEXT:    v_readlane_b32 s39, v28, 7
-; VI-NEXT:    v_readlane_b32 s38, v28, 6
-; VI-NEXT:    v_readlane_b32 s37, v28, 5
-; VI-NEXT:    v_readlane_b32 s36, v28, 4
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 31
+; VI-NEXT:    v_readlane_b32 s87, v28, 29
+; VI-NEXT:    v_readlane_b32 s86, v28, 28
+; VI-NEXT:    v_readlane_b32 s85, v28, 27
+; VI-NEXT:    v_readlane_b32 s84, v28, 26
+; VI-NEXT:    v_readlane_b32 s83, v28, 25
+; VI-NEXT:    v_readlane_b32 s82, v28, 24
+; VI-NEXT:    v_readlane_b32 s81, v28, 23
+; VI-NEXT:    v_readlane_b32 s80, v28, 22
+; VI-NEXT:    v_readlane_b32 s71, v28, 21
+; VI-NEXT:    v_readlane_b32 s70, v28, 20
+; VI-NEXT:    v_readlane_b32 s69, v28, 19
+; VI-NEXT:    v_readlane_b32 s68, v28, 18
+; VI-NEXT:    v_readlane_b32 s67, v28, 17
+; VI-NEXT:    v_readlane_b32 s66, v28, 16
+; VI-NEXT:    v_readlane_b32 s65, v28, 15
+; VI-NEXT:    v_readlane_b32 s64, v28, 14
+; VI-NEXT:    v_readlane_b32 s55, v28, 13
+; VI-NEXT:    v_readlane_b32 s54, v28, 12
+; VI-NEXT:    v_readlane_b32 s53, v28, 11
+; VI-NEXT:    v_readlane_b32 s52, v28, 10
+; VI-NEXT:    v_readlane_b32 s51, v28, 9
+; VI-NEXT:    v_readlane_b32 s50, v28, 8
+; VI-NEXT:    v_readlane_b32 s49, v28, 7
+; VI-NEXT:    v_readlane_b32 s48, v28, 6
+; VI-NEXT:    v_readlane_b32 s39, v28, 5
+; VI-NEXT:    v_readlane_b32 s38, v28, 4
+; VI-NEXT:    v_readlane_b32 s37, v28, 3
+; VI-NEXT:    v_readlane_b32 s36, v28, 2
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -7620,7 +7620,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
+define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8456,7 +8456,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28i32_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8464,20 +8464,20 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
+; SI-NEXT:    v_writelane_b32 v28, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -8493,7 +8493,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s41, v1
 ; SI-NEXT:    s_cmp_lg_u32 s40, 0
 ; SI-NEXT:    v_readfirstlane_b32 s40, v0
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
+; SI-NEXT:    v_writelane_b32 v28, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s30, s5, 16
@@ -8667,6 +8667,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s42, s30, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s43
 ; SI-NEXT:    s_or_b32 s5, s5, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 12
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -8695,20 +8696,19 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v25, s7
 ; SI-NEXT:    v_mov_b32_e32 v26, s4
 ; SI-NEXT:    v_mov_b32_e32 v27, s5
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 13
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -8751,10 +8751,10 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -8770,7 +8770,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s42, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s43, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -8944,6 +8944,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s42, s44, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s42
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -8972,10 +8973,9 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v25, s8
 ; VI-NEXT:    v_mov_b32_e32 v26, s7
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9405,7 +9405,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
+define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v28i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10787,7 +10787,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v28i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10795,37 +10795,37 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s6, v13
 ; SI-NEXT:    v_readfirstlane_b32 s8, v12
 ; SI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -10840,7 +10840,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s92, v2
 ; SI-NEXT:    v_readfirstlane_b32 s95, v1
 ; SI-NEXT:    v_readfirstlane_b32 s34, v0
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
+; SI-NEXT:    v_writelane_b32 v32, s31, 31
 ; SI-NEXT:    s_lshr_b32 s94, s29, 16
 ; SI-NEXT:    s_lshr_b32 s30, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -11221,38 +11221,38 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 30
+; SI-NEXT:    v_readlane_b32 s31, v32, 31
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -11266,41 +11266,41 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s15, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    s_lshr_b32 s61, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v10
@@ -11315,7 +11315,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s80, v1
 ; VI-NEXT:    v_readfirstlane_b32 s83, v0
 ; VI-NEXT:    v_writelane_b32 v33, s15, 0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s90, s27, 16
@@ -11602,38 +11602,38 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -11962,7 +11962,7 @@ end:
   ret <28 x i32> %phi
 }
 
-define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) {
+define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12130,7 +12130,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12674,7 +12674,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) {
+define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12863,7 +12863,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13201,7 +13201,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) {
+define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13369,7 +13369,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13913,7 +13913,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) {
+define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14039,7 +14039,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14527,7 +14527,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
+define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15335,7 +15335,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15343,20 +15343,20 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v58, s30, 0
-; SI-NEXT:    v_writelane_b32 v58, s31, 1
-; SI-NEXT:    v_writelane_b32 v58, s34, 2
-; SI-NEXT:    v_writelane_b32 v58, s35, 3
-; SI-NEXT:    v_writelane_b32 v58, s36, 4
-; SI-NEXT:    v_writelane_b32 v58, s37, 5
-; SI-NEXT:    v_writelane_b32 v58, s38, 6
-; SI-NEXT:    v_writelane_b32 v58, s39, 7
-; SI-NEXT:    v_writelane_b32 v58, s48, 8
-; SI-NEXT:    v_writelane_b32 v58, s49, 9
-; SI-NEXT:    v_writelane_b32 v58, s50, 10
-; SI-NEXT:    v_writelane_b32 v58, s51, 11
+; SI-NEXT:    v_writelane_b32 v58, s34, 0
+; SI-NEXT:    v_writelane_b32 v58, s35, 1
+; SI-NEXT:    v_writelane_b32 v58, s36, 2
+; SI-NEXT:    v_writelane_b32 v58, s37, 3
+; SI-NEXT:    v_writelane_b32 v58, s38, 4
+; SI-NEXT:    v_writelane_b32 v58, s39, 5
+; SI-NEXT:    v_writelane_b32 v58, s48, 6
+; SI-NEXT:    v_writelane_b32 v58, s49, 7
+; SI-NEXT:    v_writelane_b32 v58, s50, 8
+; SI-NEXT:    v_writelane_b32 v58, s51, 9
+; SI-NEXT:    v_writelane_b32 v58, s52, 10
+; SI-NEXT:    v_writelane_b32 v58, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v58, s52, 12
+; SI-NEXT:    v_writelane_b32 v58, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -15382,7 +15382,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v58, s53, 13
+; SI-NEXT:    v_writelane_b32 v58, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s53, s5, 16
@@ -15650,6 +15650,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_readlane_b32 s30, v58, 12
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v39
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v38
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v37
@@ -15662,20 +15663,19 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_readlane_b32 s53, v58, 13
-; SI-NEXT:    v_readlane_b32 s52, v58, 12
-; SI-NEXT:    v_readlane_b32 s51, v58, 11
-; SI-NEXT:    v_readlane_b32 s50, v58, 10
-; SI-NEXT:    v_readlane_b32 s49, v58, 9
-; SI-NEXT:    v_readlane_b32 s48, v58, 8
-; SI-NEXT:    v_readlane_b32 s39, v58, 7
-; SI-NEXT:    v_readlane_b32 s38, v58, 6
-; SI-NEXT:    v_readlane_b32 s37, v58, 5
-; SI-NEXT:    v_readlane_b32 s36, v58, 4
-; SI-NEXT:    v_readlane_b32 s35, v58, 3
-; SI-NEXT:    v_readlane_b32 s34, v58, 2
-; SI-NEXT:    v_readlane_b32 s31, v58, 1
-; SI-NEXT:    v_readlane_b32 s30, v58, 0
+; SI-NEXT:    v_readlane_b32 s31, v58, 13
+; SI-NEXT:    v_readlane_b32 s53, v58, 11
+; SI-NEXT:    v_readlane_b32 s52, v58, 10
+; SI-NEXT:    v_readlane_b32 s51, v58, 9
+; SI-NEXT:    v_readlane_b32 s50, v58, 8
+; SI-NEXT:    v_readlane_b32 s49, v58, 7
+; SI-NEXT:    v_readlane_b32 s48, v58, 6
+; SI-NEXT:    v_readlane_b32 s39, v58, 5
+; SI-NEXT:    v_readlane_b32 s38, v58, 4
+; SI-NEXT:    v_readlane_b32 s37, v58, 3
+; SI-NEXT:    v_readlane_b32 s36, v58, 2
+; SI-NEXT:    v_readlane_b32 s35, v58, 1
+; SI-NEXT:    v_readlane_b32 s34, v58, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -15688,10 +15688,10 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -15715,7 +15715,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -15937,6 +15937,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -15957,10 +15958,9 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16675,7 +16675,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
+define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17920,7 +17920,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17928,37 +17928,38 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s30, 30
+; SI-NEXT:    v_writelane_b32 v28, s31, 31
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
 ; SI-NEXT:    v_readfirstlane_b32 s9, v12
 ; SI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -17973,7 +17974,6 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s31, v2
 ; SI-NEXT:    v_readfirstlane_b32 s69, v1
 ; SI-NEXT:    v_readfirstlane_b32 s80, v0
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -18233,6 +18233,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v28, 30
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -18261,38 +18262,37 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
 ; SI-NEXT:    v_mov_b32_e32 v26, s62
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 31
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18309,38 +18309,37 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
-; VI-NEXT:    v_writelane_b32 v28, s36, 4
-; VI-NEXT:    v_writelane_b32 v28, s37, 5
-; VI-NEXT:    v_writelane_b32 v28, s38, 6
-; VI-NEXT:    v_writelane_b32 v28, s39, 7
-; VI-NEXT:    v_writelane_b32 v28, s48, 8
-; VI-NEXT:    v_writelane_b32 v28, s49, 9
-; VI-NEXT:    v_writelane_b32 v28, s50, 10
-; VI-NEXT:    v_writelane_b32 v28, s51, 11
-; VI-NEXT:    v_writelane_b32 v28, s52, 12
-; VI-NEXT:    v_writelane_b32 v28, s53, 13
-; VI-NEXT:    v_writelane_b32 v28, s54, 14
-; VI-NEXT:    v_writelane_b32 v28, s55, 15
-; VI-NEXT:    v_writelane_b32 v28, s64, 16
-; VI-NEXT:    v_writelane_b32 v28, s65, 17
-; VI-NEXT:    v_writelane_b32 v28, s66, 18
-; VI-NEXT:    v_writelane_b32 v28, s67, 19
-; VI-NEXT:    v_writelane_b32 v28, s68, 20
-; VI-NEXT:    v_writelane_b32 v28, s69, 21
-; VI-NEXT:    v_writelane_b32 v28, s70, 22
-; VI-NEXT:    v_writelane_b32 v28, s71, 23
-; VI-NEXT:    v_writelane_b32 v28, s80, 24
-; VI-NEXT:    v_writelane_b32 v28, s81, 25
-; VI-NEXT:    v_writelane_b32 v28, s82, 26
-; VI-NEXT:    v_writelane_b32 v28, s83, 27
-; VI-NEXT:    v_writelane_b32 v28, s84, 28
-; VI-NEXT:    v_writelane_b32 v28, s85, 29
-; VI-NEXT:    v_writelane_b32 v28, s86, 30
-; VI-NEXT:    v_writelane_b32 v28, s87, 31
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s36, 2
+; VI-NEXT:    v_writelane_b32 v28, s37, 3
+; VI-NEXT:    v_writelane_b32 v28, s38, 4
+; VI-NEXT:    v_writelane_b32 v28, s39, 5
+; VI-NEXT:    v_writelane_b32 v28, s48, 6
+; VI-NEXT:    v_writelane_b32 v28, s49, 7
+; VI-NEXT:    v_writelane_b32 v28, s50, 8
+; VI-NEXT:    v_writelane_b32 v28, s51, 9
+; VI-NEXT:    v_writelane_b32 v28, s52, 10
+; VI-NEXT:    v_writelane_b32 v28, s53, 11
+; VI-NEXT:    v_writelane_b32 v28, s54, 12
+; VI-NEXT:    v_writelane_b32 v28, s55, 13
+; VI-NEXT:    v_writelane_b32 v28, s64, 14
+; VI-NEXT:    v_writelane_b32 v28, s65, 15
+; VI-NEXT:    v_writelane_b32 v28, s66, 16
+; VI-NEXT:    v_writelane_b32 v28, s67, 17
+; VI-NEXT:    v_writelane_b32 v28, s68, 18
+; VI-NEXT:    v_writelane_b32 v28, s69, 19
+; VI-NEXT:    v_writelane_b32 v28, s70, 20
+; VI-NEXT:    v_writelane_b32 v28, s71, 21
+; VI-NEXT:    v_writelane_b32 v28, s80, 22
+; VI-NEXT:    v_writelane_b32 v28, s81, 23
+; VI-NEXT:    v_writelane_b32 v28, s82, 24
+; VI-NEXT:    v_writelane_b32 v28, s83, 25
+; VI-NEXT:    v_writelane_b32 v28, s84, 26
+; VI-NEXT:    v_writelane_b32 v28, s85, 27
+; VI-NEXT:    v_writelane_b32 v28, s86, 28
+; VI-NEXT:    v_writelane_b32 v28, s87, 29
+; VI-NEXT:    v_writelane_b32 v28, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s86, v13
 ; VI-NEXT:    v_readfirstlane_b32 s6, v12
 ; VI-NEXT:    v_readfirstlane_b32 s9, v11
@@ -18355,6 +18354,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s69, v2
 ; VI-NEXT:    v_readfirstlane_b32 s81, v1
 ; VI-NEXT:    v_readfirstlane_b32 s84, v0
+; VI-NEXT:    v_writelane_b32 v28, s31, 31
 ; VI-NEXT:    s_lshr_b32 s79, s29, 16
 ; VI-NEXT:    s_lshr_b32 s90, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -18614,6 +18614,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v28, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -18642,38 +18643,37 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
 ; VI-NEXT:    v_mov_b32_e32 v26, s62
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
-; VI-NEXT:    v_readlane_b32 s87, v28, 31
-; VI-NEXT:    v_readlane_b32 s86, v28, 30
-; VI-NEXT:    v_readlane_b32 s85, v28, 29
-; VI-NEXT:    v_readlane_b32 s84, v28, 28
-; VI-NEXT:    v_readlane_b32 s83, v28, 27
-; VI-NEXT:    v_readlane_b32 s82, v28, 26
-; VI-NEXT:    v_readlane_b32 s81, v28, 25
-; VI-NEXT:    v_readlane_b32 s80, v28, 24
-; VI-NEXT:    v_readlane_b32 s71, v28, 23
-; VI-NEXT:    v_readlane_b32 s70, v28, 22
-; VI-NEXT:    v_readlane_b32 s69, v28, 21
-; VI-NEXT:    v_readlane_b32 s68, v28, 20
-; VI-NEXT:    v_readlane_b32 s67, v28, 19
-; VI-NEXT:    v_readlane_b32 s66, v28, 18
-; VI-NEXT:    v_readlane_b32 s65, v28, 17
-; VI-NEXT:    v_readlane_b32 s64, v28, 16
-; VI-NEXT:    v_readlane_b32 s55, v28, 15
-; VI-NEXT:    v_readlane_b32 s54, v28, 14
-; VI-NEXT:    v_readlane_b32 s53, v28, 13
-; VI-NEXT:    v_readlane_b32 s52, v28, 12
-; VI-NEXT:    v_readlane_b32 s51, v28, 11
-; VI-NEXT:    v_readlane_b32 s50, v28, 10
-; VI-NEXT:    v_readlane_b32 s49, v28, 9
-; VI-NEXT:    v_readlane_b32 s48, v28, 8
-; VI-NEXT:    v_readlane_b32 s39, v28, 7
-; VI-NEXT:    v_readlane_b32 s38, v28, 6
-; VI-NEXT:    v_readlane_b32 s37, v28, 5
-; VI-NEXT:    v_readlane_b32 s36, v28, 4
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 31
+; VI-NEXT:    v_readlane_b32 s87, v28, 29
+; VI-NEXT:    v_readlane_b32 s86, v28, 28
+; VI-NEXT:    v_readlane_b32 s85, v28, 27
+; VI-NEXT:    v_readlane_b32 s84, v28, 26
+; VI-NEXT:    v_readlane_b32 s83, v28, 25
+; VI-NEXT:    v_readlane_b32 s82, v28, 24
+; VI-NEXT:    v_readlane_b32 s81, v28, 23
+; VI-NEXT:    v_readlane_b32 s80, v28, 22
+; VI-NEXT:    v_readlane_b32 s71, v28, 21
+; VI-NEXT:    v_readlane_b32 s70, v28, 20
+; VI-NEXT:    v_readlane_b32 s69, v28, 19
+; VI-NEXT:    v_readlane_b32 s68, v28, 18
+; VI-NEXT:    v_readlane_b32 s67, v28, 17
+; VI-NEXT:    v_readlane_b32 s66, v28, 16
+; VI-NEXT:    v_readlane_b32 s65, v28, 15
+; VI-NEXT:    v_readlane_b32 s64, v28, 14
+; VI-NEXT:    v_readlane_b32 s55, v28, 13
+; VI-NEXT:    v_readlane_b32 s54, v28, 12
+; VI-NEXT:    v_readlane_b32 s53, v28, 11
+; VI-NEXT:    v_readlane_b32 s52, v28, 10
+; VI-NEXT:    v_readlane_b32 s51, v28, 9
+; VI-NEXT:    v_readlane_b32 s50, v28, 8
+; VI-NEXT:    v_readlane_b32 s49, v28, 7
+; VI-NEXT:    v_readlane_b32 s48, v28, 6
+; VI-NEXT:    v_readlane_b32 s39, v28, 5
+; VI-NEXT:    v_readlane_b32 s38, v28, 4
+; VI-NEXT:    v_readlane_b32 s37, v28, 3
+; VI-NEXT:    v_readlane_b32 s36, v28, 2
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -19091,7 +19091,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
+define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19899,7 +19899,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v28f32_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19907,20 +19907,20 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v58, s30, 0
-; SI-NEXT:    v_writelane_b32 v58, s31, 1
-; SI-NEXT:    v_writelane_b32 v58, s34, 2
-; SI-NEXT:    v_writelane_b32 v58, s35, 3
-; SI-NEXT:    v_writelane_b32 v58, s36, 4
-; SI-NEXT:    v_writelane_b32 v58, s37, 5
-; SI-NEXT:    v_writelane_b32 v58, s38, 6
-; SI-NEXT:    v_writelane_b32 v58, s39, 7
-; SI-NEXT:    v_writelane_b32 v58, s48, 8
-; SI-NEXT:    v_writelane_b32 v58, s49, 9
-; SI-NEXT:    v_writelane_b32 v58, s50, 10
-; SI-NEXT:    v_writelane_b32 v58, s51, 11
+; SI-NEXT:    v_writelane_b32 v58, s34, 0
+; SI-NEXT:    v_writelane_b32 v58, s35, 1
+; SI-NEXT:    v_writelane_b32 v58, s36, 2
+; SI-NEXT:    v_writelane_b32 v58, s37, 3
+; SI-NEXT:    v_writelane_b32 v58, s38, 4
+; SI-NEXT:    v_writelane_b32 v58, s39, 5
+; SI-NEXT:    v_writelane_b32 v58, s48, 6
+; SI-NEXT:    v_writelane_b32 v58, s49, 7
+; SI-NEXT:    v_writelane_b32 v58, s50, 8
+; SI-NEXT:    v_writelane_b32 v58, s51, 9
+; SI-NEXT:    v_writelane_b32 v58, s52, 10
+; SI-NEXT:    v_writelane_b32 v58, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v58, s52, 12
+; SI-NEXT:    v_writelane_b32 v58, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -19946,7 +19946,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v58, s53, 13
+; SI-NEXT:    v_writelane_b32 v58, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s53, s5, 16
@@ -20214,6 +20214,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_readlane_b32 s30, v58, 12
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v39
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v38
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v37
@@ -20226,20 +20227,19 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_readlane_b32 s53, v58, 13
-; SI-NEXT:    v_readlane_b32 s52, v58, 12
-; SI-NEXT:    v_readlane_b32 s51, v58, 11
-; SI-NEXT:    v_readlane_b32 s50, v58, 10
-; SI-NEXT:    v_readlane_b32 s49, v58, 9
-; SI-NEXT:    v_readlane_b32 s48, v58, 8
-; SI-NEXT:    v_readlane_b32 s39, v58, 7
-; SI-NEXT:    v_readlane_b32 s38, v58, 6
-; SI-NEXT:    v_readlane_b32 s37, v58, 5
-; SI-NEXT:    v_readlane_b32 s36, v58, 4
-; SI-NEXT:    v_readlane_b32 s35, v58, 3
-; SI-NEXT:    v_readlane_b32 s34, v58, 2
-; SI-NEXT:    v_readlane_b32 s31, v58, 1
-; SI-NEXT:    v_readlane_b32 s30, v58, 0
+; SI-NEXT:    v_readlane_b32 s31, v58, 13
+; SI-NEXT:    v_readlane_b32 s53, v58, 11
+; SI-NEXT:    v_readlane_b32 s52, v58, 10
+; SI-NEXT:    v_readlane_b32 s51, v58, 9
+; SI-NEXT:    v_readlane_b32 s50, v58, 8
+; SI-NEXT:    v_readlane_b32 s49, v58, 7
+; SI-NEXT:    v_readlane_b32 s48, v58, 6
+; SI-NEXT:    v_readlane_b32 s39, v58, 5
+; SI-NEXT:    v_readlane_b32 s38, v58, 4
+; SI-NEXT:    v_readlane_b32 s37, v58, 3
+; SI-NEXT:    v_readlane_b32 s36, v58, 2
+; SI-NEXT:    v_readlane_b32 s35, v58, 1
+; SI-NEXT:    v_readlane_b32 s34, v58, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20252,10 +20252,10 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -20279,7 +20279,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -20501,6 +20501,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20521,10 +20522,9 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21239,7 +21239,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
+define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v28f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22621,7 +22621,7 @@ end:
   ret <28 x float> %phi
 }
 
-define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v28f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22629,37 +22629,37 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s6, v13
 ; SI-NEXT:    v_readfirstlane_b32 s8, v12
 ; SI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -22674,7 +22674,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s92, v2
 ; SI-NEXT:    v_readfirstlane_b32 s95, v1
 ; SI-NEXT:    v_readfirstlane_b32 s34, v0
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
+; SI-NEXT:    v_writelane_b32 v32, s31, 31
 ; SI-NEXT:    s_lshr_b32 s94, s29, 16
 ; SI-NEXT:    s_lshr_b32 s30, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -23055,38 +23055,38 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 30
+; SI-NEXT:    v_readlane_b32 s31, v32, 31
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23100,41 +23100,41 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s15, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    s_lshr_b32 s61, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v10
@@ -23149,7 +23149,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s80, v1
 ; VI-NEXT:    v_readfirstlane_b32 s83, v0
 ; VI-NEXT:    v_writelane_b32 v33, s15, 0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s90, s27, 16
@@ -23436,38 +23436,38 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -23796,7 +23796,7 @@ end:
   ret <28 x float> %phi
 }
 
-define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) {
+define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23985,7 +23985,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24322,7 +24322,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) {
+define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24448,7 +24448,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24936,7 +24936,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) {
+define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25786,7 +25786,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25794,20 +25794,20 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
+; SI-NEXT:    v_writelane_b32 v28, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -25823,7 +25823,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s41, v1
 ; SI-NEXT:    s_cmp_lg_u32 s40, 0
 ; SI-NEXT:    v_readfirstlane_b32 s40, v0
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
+; SI-NEXT:    v_writelane_b32 v28, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s30, s5, 16
@@ -25997,6 +25997,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s42, s30, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s43
 ; SI-NEXT:    s_or_b32 s5, s5, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 12
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -26025,20 +26026,19 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v25, s7
 ; SI-NEXT:    v_mov_b32_e32 v26, s4
 ; SI-NEXT:    v_mov_b32_e32 v27, s5
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 13
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26081,10 +26081,10 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -26100,7 +26100,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s42, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s43, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -26274,6 +26274,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s42, s44, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s42
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -26302,10 +26303,9 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v25, s8
 ; VI-NEXT:    v_mov_b32_e32 v26, s7
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26735,7 +26735,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
+define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27980,7 +27980,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27988,37 +27988,38 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s30, 30
+; SI-NEXT:    v_writelane_b32 v28, s31, 31
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
 ; SI-NEXT:    v_readfirstlane_b32 s9, v12
 ; SI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -28033,7 +28034,6 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s31, v2
 ; SI-NEXT:    v_readfirstlane_b32 s69, v1
 ; SI-NEXT:    v_readfirstlane_b32 s80, v0
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -28293,6 +28293,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v28, 30
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -28321,38 +28322,37 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
 ; SI-NEXT:    v_mov_b32_e32 v26, s62
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 31
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28369,38 +28369,37 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
-; VI-NEXT:    v_writelane_b32 v28, s36, 4
-; VI-NEXT:    v_writelane_b32 v28, s37, 5
-; VI-NEXT:    v_writelane_b32 v28, s38, 6
-; VI-NEXT:    v_writelane_b32 v28, s39, 7
-; VI-NEXT:    v_writelane_b32 v28, s48, 8
-; VI-NEXT:    v_writelane_b32 v28, s49, 9
-; VI-NEXT:    v_writelane_b32 v28, s50, 10
-; VI-NEXT:    v_writelane_b32 v28, s51, 11
-; VI-NEXT:    v_writelane_b32 v28, s52, 12
-; VI-NEXT:    v_writelane_b32 v28, s53, 13
-; VI-NEXT:    v_writelane_b32 v28, s54, 14
-; VI-NEXT:    v_writelane_b32 v28, s55, 15
-; VI-NEXT:    v_writelane_b32 v28, s64, 16
-; VI-NEXT:    v_writelane_b32 v28, s65, 17
-; VI-NEXT:    v_writelane_b32 v28, s66, 18
-; VI-NEXT:    v_writelane_b32 v28, s67, 19
-; VI-NEXT:    v_writelane_b32 v28, s68, 20
-; VI-NEXT:    v_writelane_b32 v28, s69, 21
-; VI-NEXT:    v_writelane_b32 v28, s70, 22
-; VI-NEXT:    v_writelane_b32 v28, s71, 23
-; VI-NEXT:    v_writelane_b32 v28, s80, 24
-; VI-NEXT:    v_writelane_b32 v28, s81, 25
-; VI-NEXT:    v_writelane_b32 v28, s82, 26
-; VI-NEXT:    v_writelane_b32 v28, s83, 27
-; VI-NEXT:    v_writelane_b32 v28, s84, 28
-; VI-NEXT:    v_writelane_b32 v28, s85, 29
-; VI-NEXT:    v_writelane_b32 v28, s86, 30
-; VI-NEXT:    v_writelane_b32 v28, s87, 31
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s36, 2
+; VI-NEXT:    v_writelane_b32 v28, s37, 3
+; VI-NEXT:    v_writelane_b32 v28, s38, 4
+; VI-NEXT:    v_writelane_b32 v28, s39, 5
+; VI-NEXT:    v_writelane_b32 v28, s48, 6
+; VI-NEXT:    v_writelane_b32 v28, s49, 7
+; VI-NEXT:    v_writelane_b32 v28, s50, 8
+; VI-NEXT:    v_writelane_b32 v28, s51, 9
+; VI-NEXT:    v_writelane_b32 v28, s52, 10
+; VI-NEXT:    v_writelane_b32 v28, s53, 11
+; VI-NEXT:    v_writelane_b32 v28, s54, 12
+; VI-NEXT:    v_writelane_b32 v28, s55, 13
+; VI-NEXT:    v_writelane_b32 v28, s64, 14
+; VI-NEXT:    v_writelane_b32 v28, s65, 15
+; VI-NEXT:    v_writelane_b32 v28, s66, 16
+; VI-NEXT:    v_writelane_b32 v28, s67, 17
+; VI-NEXT:    v_writelane_b32 v28, s68, 18
+; VI-NEXT:    v_writelane_b32 v28, s69, 19
+; VI-NEXT:    v_writelane_b32 v28, s70, 20
+; VI-NEXT:    v_writelane_b32 v28, s71, 21
+; VI-NEXT:    v_writelane_b32 v28, s80, 22
+; VI-NEXT:    v_writelane_b32 v28, s81, 23
+; VI-NEXT:    v_writelane_b32 v28, s82, 24
+; VI-NEXT:    v_writelane_b32 v28, s83, 25
+; VI-NEXT:    v_writelane_b32 v28, s84, 26
+; VI-NEXT:    v_writelane_b32 v28, s85, 27
+; VI-NEXT:    v_writelane_b32 v28, s86, 28
+; VI-NEXT:    v_writelane_b32 v28, s87, 29
+; VI-NEXT:    v_writelane_b32 v28, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s86, v13
 ; VI-NEXT:    v_readfirstlane_b32 s6, v12
 ; VI-NEXT:    v_readfirstlane_b32 s9, v11
@@ -28415,6 +28414,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s69, v2
 ; VI-NEXT:    v_readfirstlane_b32 s81, v1
 ; VI-NEXT:    v_readfirstlane_b32 s84, v0
+; VI-NEXT:    v_writelane_b32 v28, s31, 31
 ; VI-NEXT:    s_lshr_b32 s79, s29, 16
 ; VI-NEXT:    s_lshr_b32 s90, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -28674,6 +28674,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v28, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -28702,38 +28703,37 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
 ; VI-NEXT:    v_mov_b32_e32 v26, s62
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
-; VI-NEXT:    v_readlane_b32 s87, v28, 31
-; VI-NEXT:    v_readlane_b32 s86, v28, 30
-; VI-NEXT:    v_readlane_b32 s85, v28, 29
-; VI-NEXT:    v_readlane_b32 s84, v28, 28
-; VI-NEXT:    v_readlane_b32 s83, v28, 27
-; VI-NEXT:    v_readlane_b32 s82, v28, 26
-; VI-NEXT:    v_readlane_b32 s81, v28, 25
-; VI-NEXT:    v_readlane_b32 s80, v28, 24
-; VI-NEXT:    v_readlane_b32 s71, v28, 23
-; VI-NEXT:    v_readlane_b32 s70, v28, 22
-; VI-NEXT:    v_readlane_b32 s69, v28, 21
-; VI-NEXT:    v_readlane_b32 s68, v28, 20
-; VI-NEXT:    v_readlane_b32 s67, v28, 19
-; VI-NEXT:    v_readlane_b32 s66, v28, 18
-; VI-NEXT:    v_readlane_b32 s65, v28, 17
-; VI-NEXT:    v_readlane_b32 s64, v28, 16
-; VI-NEXT:    v_readlane_b32 s55, v28, 15
-; VI-NEXT:    v_readlane_b32 s54, v28, 14
-; VI-NEXT:    v_readlane_b32 s53, v28, 13
-; VI-NEXT:    v_readlane_b32 s52, v28, 12
-; VI-NEXT:    v_readlane_b32 s51, v28, 11
-; VI-NEXT:    v_readlane_b32 s50, v28, 10
-; VI-NEXT:    v_readlane_b32 s49, v28, 9
-; VI-NEXT:    v_readlane_b32 s48, v28, 8
-; VI-NEXT:    v_readlane_b32 s39, v28, 7
-; VI-NEXT:    v_readlane_b32 s38, v28, 6
-; VI-NEXT:    v_readlane_b32 s37, v28, 5
-; VI-NEXT:    v_readlane_b32 s36, v28, 4
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 31
+; VI-NEXT:    v_readlane_b32 s87, v28, 29
+; VI-NEXT:    v_readlane_b32 s86, v28, 28
+; VI-NEXT:    v_readlane_b32 s85, v28, 27
+; VI-NEXT:    v_readlane_b32 s84, v28, 26
+; VI-NEXT:    v_readlane_b32 s83, v28, 25
+; VI-NEXT:    v_readlane_b32 s82, v28, 24
+; VI-NEXT:    v_readlane_b32 s81, v28, 23
+; VI-NEXT:    v_readlane_b32 s80, v28, 22
+; VI-NEXT:    v_readlane_b32 s71, v28, 21
+; VI-NEXT:    v_readlane_b32 s70, v28, 20
+; VI-NEXT:    v_readlane_b32 s69, v28, 19
+; VI-NEXT:    v_readlane_b32 s68, v28, 18
+; VI-NEXT:    v_readlane_b32 s67, v28, 17
+; VI-NEXT:    v_readlane_b32 s66, v28, 16
+; VI-NEXT:    v_readlane_b32 s65, v28, 15
+; VI-NEXT:    v_readlane_b32 s64, v28, 14
+; VI-NEXT:    v_readlane_b32 s55, v28, 13
+; VI-NEXT:    v_readlane_b32 s54, v28, 12
+; VI-NEXT:    v_readlane_b32 s53, v28, 11
+; VI-NEXT:    v_readlane_b32 s52, v28, 10
+; VI-NEXT:    v_readlane_b32 s51, v28, 9
+; VI-NEXT:    v_readlane_b32 s50, v28, 8
+; VI-NEXT:    v_readlane_b32 s49, v28, 7
+; VI-NEXT:    v_readlane_b32 s48, v28, 6
+; VI-NEXT:    v_readlane_b32 s39, v28, 5
+; VI-NEXT:    v_readlane_b32 s38, v28, 4
+; VI-NEXT:    v_readlane_b32 s37, v28, 3
+; VI-NEXT:    v_readlane_b32 s36, v28, 2
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -29151,7 +29151,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) {
+define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30001,7 +30001,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14i64_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30009,20 +30009,20 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s40, v14
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
+; SI-NEXT:    v_writelane_b32 v28, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s5, v13
 ; SI-NEXT:    v_readfirstlane_b32 s4, v12
 ; SI-NEXT:    v_readfirstlane_b32 s7, v11
@@ -30038,7 +30038,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s41, v1
 ; SI-NEXT:    s_cmp_lg_u32 s40, 0
 ; SI-NEXT:    v_readfirstlane_b32 s40, v0
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
+; SI-NEXT:    v_writelane_b32 v28, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s30, s5, 16
@@ -30212,6 +30212,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s42, s30, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s43
 ; SI-NEXT:    s_or_b32 s5, s5, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 12
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -30240,20 +30241,19 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v25, s7
 ; SI-NEXT:    v_mov_b32_e32 v26, s4
 ; SI-NEXT:    v_mov_b32_e32 v27, s5
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 13
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30296,10 +30296,10 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    v_readfirstlane_b32 s7, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v11
@@ -30315,7 +30315,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s42, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s43, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s6, 16
@@ -30489,6 +30489,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s42, s44, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s42
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -30517,10 +30518,9 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v25, s8
 ; VI-NEXT:    v_mov_b32_e32 v26, s7
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30950,7 +30950,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
+define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v14i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32332,7 +32332,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v14i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32340,37 +32340,37 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s6, v13
 ; SI-NEXT:    v_readfirstlane_b32 s8, v12
 ; SI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -32385,7 +32385,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s92, v2
 ; SI-NEXT:    v_readfirstlane_b32 s95, v1
 ; SI-NEXT:    v_readfirstlane_b32 s34, v0
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
+; SI-NEXT:    v_writelane_b32 v32, s31, 31
 ; SI-NEXT:    s_lshr_b32 s94, s29, 16
 ; SI-NEXT:    s_lshr_b32 s30, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -32766,38 +32766,38 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 30
+; SI-NEXT:    v_readlane_b32 s31, v32, 31
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32811,41 +32811,41 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s15, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    s_lshr_b32 s61, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v10
@@ -32860,7 +32860,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s80, v1
 ; VI-NEXT:    v_readfirstlane_b32 s83, v0
 ; VI-NEXT:    v_writelane_b32 v33, s15, 0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s90, s27, 16
@@ -33147,38 +33147,38 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -33507,7 +33507,7 @@ end:
   ret <14 x i64> %phi
 }
 
-define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) {
+define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34273,7 +34273,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34281,20 +34281,20 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v58, s30, 0
-; SI-NEXT:    v_writelane_b32 v58, s31, 1
-; SI-NEXT:    v_writelane_b32 v58, s34, 2
-; SI-NEXT:    v_writelane_b32 v58, s35, 3
-; SI-NEXT:    v_writelane_b32 v58, s36, 4
-; SI-NEXT:    v_writelane_b32 v58, s37, 5
-; SI-NEXT:    v_writelane_b32 v58, s38, 6
-; SI-NEXT:    v_writelane_b32 v58, s39, 7
-; SI-NEXT:    v_writelane_b32 v58, s48, 8
-; SI-NEXT:    v_writelane_b32 v58, s49, 9
-; SI-NEXT:    v_writelane_b32 v58, s50, 10
-; SI-NEXT:    v_writelane_b32 v58, s51, 11
+; SI-NEXT:    v_writelane_b32 v58, s34, 0
+; SI-NEXT:    v_writelane_b32 v58, s35, 1
+; SI-NEXT:    v_writelane_b32 v58, s36, 2
+; SI-NEXT:    v_writelane_b32 v58, s37, 3
+; SI-NEXT:    v_writelane_b32 v58, s38, 4
+; SI-NEXT:    v_writelane_b32 v58, s39, 5
+; SI-NEXT:    v_writelane_b32 v58, s48, 6
+; SI-NEXT:    v_writelane_b32 v58, s49, 7
+; SI-NEXT:    v_writelane_b32 v58, s50, 8
+; SI-NEXT:    v_writelane_b32 v58, s51, 9
+; SI-NEXT:    v_writelane_b32 v58, s52, 10
+; SI-NEXT:    v_writelane_b32 v58, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
-; SI-NEXT:    v_writelane_b32 v58, s52, 12
+; SI-NEXT:    v_writelane_b32 v58, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s41, v13
 ; SI-NEXT:    v_readfirstlane_b32 s40, v12
 ; SI-NEXT:    v_readfirstlane_b32 s15, v11
@@ -34320,7 +34320,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v58, s53, 13
+; SI-NEXT:    v_writelane_b32 v58, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s53, s41, 16
@@ -34580,6 +34580,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_readlane_b32 s30, v58, 12
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v39
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v38
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v37
@@ -34592,20 +34593,19 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_readlane_b32 s53, v58, 13
-; SI-NEXT:    v_readlane_b32 s52, v58, 12
-; SI-NEXT:    v_readlane_b32 s51, v58, 11
-; SI-NEXT:    v_readlane_b32 s50, v58, 10
-; SI-NEXT:    v_readlane_b32 s49, v58, 9
-; SI-NEXT:    v_readlane_b32 s48, v58, 8
-; SI-NEXT:    v_readlane_b32 s39, v58, 7
-; SI-NEXT:    v_readlane_b32 s38, v58, 6
-; SI-NEXT:    v_readlane_b32 s37, v58, 5
-; SI-NEXT:    v_readlane_b32 s36, v58, 4
-; SI-NEXT:    v_readlane_b32 s35, v58, 3
-; SI-NEXT:    v_readlane_b32 s34, v58, 2
-; SI-NEXT:    v_readlane_b32 s31, v58, 1
-; SI-NEXT:    v_readlane_b32 s30, v58, 0
+; SI-NEXT:    v_readlane_b32 s31, v58, 13
+; SI-NEXT:    v_readlane_b32 s53, v58, 11
+; SI-NEXT:    v_readlane_b32 s52, v58, 10
+; SI-NEXT:    v_readlane_b32 s51, v58, 9
+; SI-NEXT:    v_readlane_b32 s50, v58, 8
+; SI-NEXT:    v_readlane_b32 s49, v58, 7
+; SI-NEXT:    v_readlane_b32 s48, v58, 6
+; SI-NEXT:    v_readlane_b32 s39, v58, 5
+; SI-NEXT:    v_readlane_b32 s38, v58, 4
+; SI-NEXT:    v_readlane_b32 s37, v58, 3
+; SI-NEXT:    v_readlane_b32 s36, v58, 2
+; SI-NEXT:    v_readlane_b32 s35, v58, 1
+; SI-NEXT:    v_readlane_b32 s34, v58, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -34618,10 +34618,10 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s9, v13
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
 ; VI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -34645,7 +34645,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s9, 16
@@ -34862,6 +34862,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v33
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v31
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v7, v7, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v11, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -34873,10 +34874,9 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v23, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35550,7 +35550,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
+define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36795,7 +36795,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36803,37 +36803,38 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s30, 30
+; SI-NEXT:    v_writelane_b32 v28, s31, 31
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
 ; SI-NEXT:    v_readfirstlane_b32 s9, v12
 ; SI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -36848,7 +36849,6 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s31, v2
 ; SI-NEXT:    v_readfirstlane_b32 s69, v1
 ; SI-NEXT:    v_readfirstlane_b32 s80, v0
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
 ; SI-NEXT:    s_lshr_b32 s78, s29, 16
 ; SI-NEXT:    s_lshr_b32 s89, s28, 16
 ; SI-NEXT:    s_lshr_b32 s92, s27, 16
@@ -37108,6 +37108,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v28, 30
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -37136,38 +37137,37 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v25, s61
 ; SI-NEXT:    v_mov_b32_e32 v26, s62
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 31
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -37184,38 +37184,37 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
-; VI-NEXT:    v_writelane_b32 v28, s36, 4
-; VI-NEXT:    v_writelane_b32 v28, s37, 5
-; VI-NEXT:    v_writelane_b32 v28, s38, 6
-; VI-NEXT:    v_writelane_b32 v28, s39, 7
-; VI-NEXT:    v_writelane_b32 v28, s48, 8
-; VI-NEXT:    v_writelane_b32 v28, s49, 9
-; VI-NEXT:    v_writelane_b32 v28, s50, 10
-; VI-NEXT:    v_writelane_b32 v28, s51, 11
-; VI-NEXT:    v_writelane_b32 v28, s52, 12
-; VI-NEXT:    v_writelane_b32 v28, s53, 13
-; VI-NEXT:    v_writelane_b32 v28, s54, 14
-; VI-NEXT:    v_writelane_b32 v28, s55, 15
-; VI-NEXT:    v_writelane_b32 v28, s64, 16
-; VI-NEXT:    v_writelane_b32 v28, s65, 17
-; VI-NEXT:    v_writelane_b32 v28, s66, 18
-; VI-NEXT:    v_writelane_b32 v28, s67, 19
-; VI-NEXT:    v_writelane_b32 v28, s68, 20
-; VI-NEXT:    v_writelane_b32 v28, s69, 21
-; VI-NEXT:    v_writelane_b32 v28, s70, 22
-; VI-NEXT:    v_writelane_b32 v28, s71, 23
-; VI-NEXT:    v_writelane_b32 v28, s80, 24
-; VI-NEXT:    v_writelane_b32 v28, s81, 25
-; VI-NEXT:    v_writelane_b32 v28, s82, 26
-; VI-NEXT:    v_writelane_b32 v28, s83, 27
-; VI-NEXT:    v_writelane_b32 v28, s84, 28
-; VI-NEXT:    v_writelane_b32 v28, s85, 29
-; VI-NEXT:    v_writelane_b32 v28, s86, 30
-; VI-NEXT:    v_writelane_b32 v28, s87, 31
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s36, 2
+; VI-NEXT:    v_writelane_b32 v28, s37, 3
+; VI-NEXT:    v_writelane_b32 v28, s38, 4
+; VI-NEXT:    v_writelane_b32 v28, s39, 5
+; VI-NEXT:    v_writelane_b32 v28, s48, 6
+; VI-NEXT:    v_writelane_b32 v28, s49, 7
+; VI-NEXT:    v_writelane_b32 v28, s50, 8
+; VI-NEXT:    v_writelane_b32 v28, s51, 9
+; VI-NEXT:    v_writelane_b32 v28, s52, 10
+; VI-NEXT:    v_writelane_b32 v28, s53, 11
+; VI-NEXT:    v_writelane_b32 v28, s54, 12
+; VI-NEXT:    v_writelane_b32 v28, s55, 13
+; VI-NEXT:    v_writelane_b32 v28, s64, 14
+; VI-NEXT:    v_writelane_b32 v28, s65, 15
+; VI-NEXT:    v_writelane_b32 v28, s66, 16
+; VI-NEXT:    v_writelane_b32 v28, s67, 17
+; VI-NEXT:    v_writelane_b32 v28, s68, 18
+; VI-NEXT:    v_writelane_b32 v28, s69, 19
+; VI-NEXT:    v_writelane_b32 v28, s70, 20
+; VI-NEXT:    v_writelane_b32 v28, s71, 21
+; VI-NEXT:    v_writelane_b32 v28, s80, 22
+; VI-NEXT:    v_writelane_b32 v28, s81, 23
+; VI-NEXT:    v_writelane_b32 v28, s82, 24
+; VI-NEXT:    v_writelane_b32 v28, s83, 25
+; VI-NEXT:    v_writelane_b32 v28, s84, 26
+; VI-NEXT:    v_writelane_b32 v28, s85, 27
+; VI-NEXT:    v_writelane_b32 v28, s86, 28
+; VI-NEXT:    v_writelane_b32 v28, s87, 29
+; VI-NEXT:    v_writelane_b32 v28, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s86, v13
 ; VI-NEXT:    v_readfirstlane_b32 s6, v12
 ; VI-NEXT:    v_readfirstlane_b32 s9, v11
@@ -37230,6 +37229,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s69, v2
 ; VI-NEXT:    v_readfirstlane_b32 s81, v1
 ; VI-NEXT:    v_readfirstlane_b32 s84, v0
+; VI-NEXT:    v_writelane_b32 v28, s31, 31
 ; VI-NEXT:    s_lshr_b32 s79, s29, 16
 ; VI-NEXT:    s_lshr_b32 s90, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -37489,6 +37489,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s63, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v28, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -37517,38 +37518,37 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v25, s61
 ; VI-NEXT:    v_mov_b32_e32 v26, s62
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
-; VI-NEXT:    v_readlane_b32 s87, v28, 31
-; VI-NEXT:    v_readlane_b32 s86, v28, 30
-; VI-NEXT:    v_readlane_b32 s85, v28, 29
-; VI-NEXT:    v_readlane_b32 s84, v28, 28
-; VI-NEXT:    v_readlane_b32 s83, v28, 27
-; VI-NEXT:    v_readlane_b32 s82, v28, 26
-; VI-NEXT:    v_readlane_b32 s81, v28, 25
-; VI-NEXT:    v_readlane_b32 s80, v28, 24
-; VI-NEXT:    v_readlane_b32 s71, v28, 23
-; VI-NEXT:    v_readlane_b32 s70, v28, 22
-; VI-NEXT:    v_readlane_b32 s69, v28, 21
-; VI-NEXT:    v_readlane_b32 s68, v28, 20
-; VI-NEXT:    v_readlane_b32 s67, v28, 19
-; VI-NEXT:    v_readlane_b32 s66, v28, 18
-; VI-NEXT:    v_readlane_b32 s65, v28, 17
-; VI-NEXT:    v_readlane_b32 s64, v28, 16
-; VI-NEXT:    v_readlane_b32 s55, v28, 15
-; VI-NEXT:    v_readlane_b32 s54, v28, 14
-; VI-NEXT:    v_readlane_b32 s53, v28, 13
-; VI-NEXT:    v_readlane_b32 s52, v28, 12
-; VI-NEXT:    v_readlane_b32 s51, v28, 11
-; VI-NEXT:    v_readlane_b32 s50, v28, 10
-; VI-NEXT:    v_readlane_b32 s49, v28, 9
-; VI-NEXT:    v_readlane_b32 s48, v28, 8
-; VI-NEXT:    v_readlane_b32 s39, v28, 7
-; VI-NEXT:    v_readlane_b32 s38, v28, 6
-; VI-NEXT:    v_readlane_b32 s37, v28, 5
-; VI-NEXT:    v_readlane_b32 s36, v28, 4
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 31
+; VI-NEXT:    v_readlane_b32 s87, v28, 29
+; VI-NEXT:    v_readlane_b32 s86, v28, 28
+; VI-NEXT:    v_readlane_b32 s85, v28, 27
+; VI-NEXT:    v_readlane_b32 s84, v28, 26
+; VI-NEXT:    v_readlane_b32 s83, v28, 25
+; VI-NEXT:    v_readlane_b32 s82, v28, 24
+; VI-NEXT:    v_readlane_b32 s81, v28, 23
+; VI-NEXT:    v_readlane_b32 s80, v28, 22
+; VI-NEXT:    v_readlane_b32 s71, v28, 21
+; VI-NEXT:    v_readlane_b32 s70, v28, 20
+; VI-NEXT:    v_readlane_b32 s69, v28, 19
+; VI-NEXT:    v_readlane_b32 s68, v28, 18
+; VI-NEXT:    v_readlane_b32 s67, v28, 17
+; VI-NEXT:    v_readlane_b32 s66, v28, 16
+; VI-NEXT:    v_readlane_b32 s65, v28, 15
+; VI-NEXT:    v_readlane_b32 s64, v28, 14
+; VI-NEXT:    v_readlane_b32 s55, v28, 13
+; VI-NEXT:    v_readlane_b32 s54, v28, 12
+; VI-NEXT:    v_readlane_b32 s53, v28, 11
+; VI-NEXT:    v_readlane_b32 s52, v28, 10
+; VI-NEXT:    v_readlane_b32 s51, v28, 9
+; VI-NEXT:    v_readlane_b32 s50, v28, 8
+; VI-NEXT:    v_readlane_b32 s49, v28, 7
+; VI-NEXT:    v_readlane_b32 s48, v28, 6
+; VI-NEXT:    v_readlane_b32 s39, v28, 5
+; VI-NEXT:    v_readlane_b32 s38, v28, 4
+; VI-NEXT:    v_readlane_b32 s37, v28, 3
+; VI-NEXT:    v_readlane_b32 s36, v28, 2
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -37966,7 +37966,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
+define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38732,7 +38732,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v14f64_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38740,20 +38740,20 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v58, s30, 0
-; SI-NEXT:    v_writelane_b32 v58, s31, 1
-; SI-NEXT:    v_writelane_b32 v58, s34, 2
-; SI-NEXT:    v_writelane_b32 v58, s35, 3
-; SI-NEXT:    v_writelane_b32 v58, s36, 4
-; SI-NEXT:    v_writelane_b32 v58, s37, 5
-; SI-NEXT:    v_writelane_b32 v58, s38, 6
-; SI-NEXT:    v_writelane_b32 v58, s39, 7
-; SI-NEXT:    v_writelane_b32 v58, s48, 8
-; SI-NEXT:    v_writelane_b32 v58, s49, 9
-; SI-NEXT:    v_writelane_b32 v58, s50, 10
-; SI-NEXT:    v_writelane_b32 v58, s51, 11
+; SI-NEXT:    v_writelane_b32 v58, s34, 0
+; SI-NEXT:    v_writelane_b32 v58, s35, 1
+; SI-NEXT:    v_writelane_b32 v58, s36, 2
+; SI-NEXT:    v_writelane_b32 v58, s37, 3
+; SI-NEXT:    v_writelane_b32 v58, s38, 4
+; SI-NEXT:    v_writelane_b32 v58, s39, 5
+; SI-NEXT:    v_writelane_b32 v58, s48, 6
+; SI-NEXT:    v_writelane_b32 v58, s49, 7
+; SI-NEXT:    v_writelane_b32 v58, s50, 8
+; SI-NEXT:    v_writelane_b32 v58, s51, 9
+; SI-NEXT:    v_writelane_b32 v58, s52, 10
+; SI-NEXT:    v_writelane_b32 v58, s53, 11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
-; SI-NEXT:    v_writelane_b32 v58, s52, 12
+; SI-NEXT:    v_writelane_b32 v58, s30, 12
 ; SI-NEXT:    v_readfirstlane_b32 s41, v13
 ; SI-NEXT:    v_readfirstlane_b32 s40, v12
 ; SI-NEXT:    v_readfirstlane_b32 s15, v11
@@ -38779,7 +38779,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v58, s53, 13
+; SI-NEXT:    v_writelane_b32 v58, s31, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s53, s41, 16
@@ -39039,6 +39039,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_readlane_b32 s30, v58, 12
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v39
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v38
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v37
@@ -39051,20 +39052,19 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_readlane_b32 s53, v58, 13
-; SI-NEXT:    v_readlane_b32 s52, v58, 12
-; SI-NEXT:    v_readlane_b32 s51, v58, 11
-; SI-NEXT:    v_readlane_b32 s50, v58, 10
-; SI-NEXT:    v_readlane_b32 s49, v58, 9
-; SI-NEXT:    v_readlane_b32 s48, v58, 8
-; SI-NEXT:    v_readlane_b32 s39, v58, 7
-; SI-NEXT:    v_readlane_b32 s38, v58, 6
-; SI-NEXT:    v_readlane_b32 s37, v58, 5
-; SI-NEXT:    v_readlane_b32 s36, v58, 4
-; SI-NEXT:    v_readlane_b32 s35, v58, 3
-; SI-NEXT:    v_readlane_b32 s34, v58, 2
-; SI-NEXT:    v_readlane_b32 s31, v58, 1
-; SI-NEXT:    v_readlane_b32 s30, v58, 0
+; SI-NEXT:    v_readlane_b32 s31, v58, 13
+; SI-NEXT:    v_readlane_b32 s53, v58, 11
+; SI-NEXT:    v_readlane_b32 s52, v58, 10
+; SI-NEXT:    v_readlane_b32 s51, v58, 9
+; SI-NEXT:    v_readlane_b32 s50, v58, 8
+; SI-NEXT:    v_readlane_b32 s49, v58, 7
+; SI-NEXT:    v_readlane_b32 s48, v58, 6
+; SI-NEXT:    v_readlane_b32 s39, v58, 5
+; SI-NEXT:    v_readlane_b32 s38, v58, 4
+; SI-NEXT:    v_readlane_b32 s37, v58, 3
+; SI-NEXT:    v_readlane_b32 s36, v58, 2
+; SI-NEXT:    v_readlane_b32 s35, v58, 1
+; SI-NEXT:    v_readlane_b32 s34, v58, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39077,10 +39077,10 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
 ; VI-NEXT:    v_readfirstlane_b32 s4, v14
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s9, v13
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
 ; VI-NEXT:    v_readfirstlane_b32 s11, v11
@@ -39104,7 +39104,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s44, s9, 16
@@ -39321,6 +39321,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v33
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v31
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v7, v7, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v11, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -39332,10 +39333,9 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v23, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -40009,7 +40009,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
+define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v14f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41391,7 +41391,7 @@ end:
   ret <14 x double> %phi
 }
 
-define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v14f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41399,37 +41399,37 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s30, 30
 ; SI-NEXT:    v_readfirstlane_b32 s6, v13
 ; SI-NEXT:    v_readfirstlane_b32 s8, v12
 ; SI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -41444,7 +41444,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s92, v2
 ; SI-NEXT:    v_readfirstlane_b32 s95, v1
 ; SI-NEXT:    v_readfirstlane_b32 s34, v0
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
+; SI-NEXT:    v_writelane_b32 v32, s31, 31
 ; SI-NEXT:    s_lshr_b32 s94, s29, 16
 ; SI-NEXT:    s_lshr_b32 s30, s28, 16
 ; SI-NEXT:    s_lshr_b32 s35, s27, 16
@@ -41825,38 +41825,38 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 30
+; SI-NEXT:    v_readlane_b32 s31, v32, 31
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -41870,41 +41870,41 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s15, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    v_readfirstlane_b32 s6, v13
 ; VI-NEXT:    s_lshr_b32 s61, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v10
@@ -41919,7 +41919,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s80, v1
 ; VI-NEXT:    v_readfirstlane_b32 s83, v0
 ; VI-NEXT:    v_writelane_b32 v33, s15, 0
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s75, s28, 16
 ; VI-NEXT:    s_lshr_b32 s90, s27, 16
@@ -42206,38 +42206,38 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -42566,7 +42566,7 @@ end:
   ret <14 x double> %phi
 }
 
-define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
+define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v56f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43917,7 +43917,7 @@ end:
   ret <56 x half> %phi
 }
 
-define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56i16_to_v56f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43926,42 +43926,41 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v28, s30, 0
-; SI-NEXT:    v_writelane_b32 v28, s31, 1
-; SI-NEXT:    v_writelane_b32 v28, s34, 2
-; SI-NEXT:    v_writelane_b32 v28, s35, 3
-; SI-NEXT:    v_writelane_b32 v28, s36, 4
-; SI-NEXT:    v_writelane_b32 v28, s37, 5
-; SI-NEXT:    v_writelane_b32 v28, s38, 6
-; SI-NEXT:    v_writelane_b32 v28, s39, 7
-; SI-NEXT:    v_writelane_b32 v28, s48, 8
-; SI-NEXT:    v_writelane_b32 v28, s49, 9
-; SI-NEXT:    v_writelane_b32 v28, s50, 10
-; SI-NEXT:    v_writelane_b32 v28, s51, 11
-; SI-NEXT:    v_writelane_b32 v28, s52, 12
-; SI-NEXT:    v_writelane_b32 v28, s53, 13
-; SI-NEXT:    v_writelane_b32 v28, s54, 14
-; SI-NEXT:    v_writelane_b32 v28, s55, 15
-; SI-NEXT:    v_writelane_b32 v28, s64, 16
-; SI-NEXT:    v_writelane_b32 v28, s65, 17
-; SI-NEXT:    v_writelane_b32 v28, s66, 18
-; SI-NEXT:    v_writelane_b32 v28, s67, 19
-; SI-NEXT:    v_writelane_b32 v28, s68, 20
-; SI-NEXT:    v_writelane_b32 v28, s69, 21
-; SI-NEXT:    v_writelane_b32 v28, s70, 22
-; SI-NEXT:    v_writelane_b32 v28, s71, 23
-; SI-NEXT:    v_writelane_b32 v28, s80, 24
-; SI-NEXT:    v_writelane_b32 v28, s81, 25
-; SI-NEXT:    v_writelane_b32 v28, s82, 26
-; SI-NEXT:    v_writelane_b32 v28, s83, 27
-; SI-NEXT:    v_writelane_b32 v28, s84, 28
-; SI-NEXT:    v_writelane_b32 v28, s85, 29
-; SI-NEXT:    v_writelane_b32 v28, s86, 30
-; SI-NEXT:    v_writelane_b32 v28, s87, 31
-; SI-NEXT:    v_writelane_b32 v28, s96, 32
-; SI-NEXT:    v_writelane_b32 v28, s97, 33
-; SI-NEXT:    v_writelane_b32 v28, s98, 34
-; SI-NEXT:    v_writelane_b32 v28, s99, 35
+; SI-NEXT:    v_writelane_b32 v28, s34, 0
+; SI-NEXT:    v_writelane_b32 v28, s35, 1
+; SI-NEXT:    v_writelane_b32 v28, s36, 2
+; SI-NEXT:    v_writelane_b32 v28, s37, 3
+; SI-NEXT:    v_writelane_b32 v28, s38, 4
+; SI-NEXT:    v_writelane_b32 v28, s39, 5
+; SI-NEXT:    v_writelane_b32 v28, s48, 6
+; SI-NEXT:    v_writelane_b32 v28, s49, 7
+; SI-NEXT:    v_writelane_b32 v28, s50, 8
+; SI-NEXT:    v_writelane_b32 v28, s51, 9
+; SI-NEXT:    v_writelane_b32 v28, s52, 10
+; SI-NEXT:    v_writelane_b32 v28, s53, 11
+; SI-NEXT:    v_writelane_b32 v28, s54, 12
+; SI-NEXT:    v_writelane_b32 v28, s55, 13
+; SI-NEXT:    v_writelane_b32 v28, s64, 14
+; SI-NEXT:    v_writelane_b32 v28, s65, 15
+; SI-NEXT:    v_writelane_b32 v28, s66, 16
+; SI-NEXT:    v_writelane_b32 v28, s67, 17
+; SI-NEXT:    v_writelane_b32 v28, s68, 18
+; SI-NEXT:    v_writelane_b32 v28, s69, 19
+; SI-NEXT:    v_writelane_b32 v28, s70, 20
+; SI-NEXT:    v_writelane_b32 v28, s71, 21
+; SI-NEXT:    v_writelane_b32 v28, s80, 22
+; SI-NEXT:    v_writelane_b32 v28, s81, 23
+; SI-NEXT:    v_writelane_b32 v28, s82, 24
+; SI-NEXT:    v_writelane_b32 v28, s83, 25
+; SI-NEXT:    v_writelane_b32 v28, s84, 26
+; SI-NEXT:    v_writelane_b32 v28, s85, 27
+; SI-NEXT:    v_writelane_b32 v28, s86, 28
+; SI-NEXT:    v_writelane_b32 v28, s87, 29
+; SI-NEXT:    v_writelane_b32 v28, s96, 30
+; SI-NEXT:    v_writelane_b32 v28, s97, 31
+; SI-NEXT:    v_writelane_b32 v28, s98, 32
+; SI-NEXT:    v_writelane_b32 v28, s99, 33
+; SI-NEXT:    v_writelane_b32 v28, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s54, v13
 ; SI-NEXT:    v_readfirstlane_b32 s55, v12
 ; SI-NEXT:    v_readfirstlane_b32 s52, v11
@@ -43976,6 +43975,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s84, v2
 ; SI-NEXT:    v_readfirstlane_b32 s97, v1
 ; SI-NEXT:    v_readfirstlane_b32 s99, v0
+; SI-NEXT:    v_writelane_b32 v28, s31, 35
 ; SI-NEXT:    s_lshr_b32 s64, s29, 16
 ; SI-NEXT:    s_lshr_b32 s37, s28, 16
 ; SI-NEXT:    s_lshr_b32 s65, s27, 16
@@ -44441,6 +44441,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; SI-NEXT:    s_lshl_b32 s42, s78, 16
 ; SI-NEXT:    s_or_b32 s11, s11, s42
+; SI-NEXT:    v_readlane_b32 s30, v28, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
@@ -44469,42 +44470,41 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v25, s13
 ; SI-NEXT:    v_mov_b32_e32 v26, s10
 ; SI-NEXT:    v_mov_b32_e32 v27, s11
-; SI-NEXT:    v_readlane_b32 s99, v28, 35
-; SI-NEXT:    v_readlane_b32 s98, v28, 34
-; SI-NEXT:    v_readlane_b32 s97, v28, 33
-; SI-NEXT:    v_readlane_b32 s96, v28, 32
-; SI-NEXT:    v_readlane_b32 s87, v28, 31
-; SI-NEXT:    v_readlane_b32 s86, v28, 30
-; SI-NEXT:    v_readlane_b32 s85, v28, 29
-; SI-NEXT:    v_readlane_b32 s84, v28, 28
-; SI-NEXT:    v_readlane_b32 s83, v28, 27
-; SI-NEXT:    v_readlane_b32 s82, v28, 26
-; SI-NEXT:    v_readlane_b32 s81, v28, 25
-; SI-NEXT:    v_readlane_b32 s80, v28, 24
-; SI-NEXT:    v_readlane_b32 s71, v28, 23
-; SI-NEXT:    v_readlane_b32 s70, v28, 22
-; SI-NEXT:    v_readlane_b32 s69, v28, 21
-; SI-NEXT:    v_readlane_b32 s68, v28, 20
-; SI-NEXT:    v_readlane_b32 s67, v28, 19
-; SI-NEXT:    v_readlane_b32 s66, v28, 18
-; SI-NEXT:    v_readlane_b32 s65, v28, 17
-; SI-NEXT:    v_readlane_b32 s64, v28, 16
-; SI-NEXT:    v_readlane_b32 s55, v28, 15
-; SI-NEXT:    v_readlane_b32 s54, v28, 14
-; SI-NEXT:    v_readlane_b32 s53, v28, 13
-; SI-NEXT:    v_readlane_b32 s52, v28, 12
-; SI-NEXT:    v_readlane_b32 s51, v28, 11
-; SI-NEXT:    v_readlane_b32 s50, v28, 10
-; SI-NEXT:    v_readlane_b32 s49, v28, 9
-; SI-NEXT:    v_readlane_b32 s48, v28, 8
-; SI-NEXT:    v_readlane_b32 s39, v28, 7
-; SI-NEXT:    v_readlane_b32 s38, v28, 6
-; SI-NEXT:    v_readlane_b32 s37, v28, 5
-; SI-NEXT:    v_readlane_b32 s36, v28, 4
-; SI-NEXT:    v_readlane_b32 s35, v28, 3
-; SI-NEXT:    v_readlane_b32 s34, v28, 2
-; SI-NEXT:    v_readlane_b32 s31, v28, 1
-; SI-NEXT:    v_readlane_b32 s30, v28, 0
+; SI-NEXT:    v_readlane_b32 s31, v28, 35
+; SI-NEXT:    v_readlane_b32 s99, v28, 33
+; SI-NEXT:    v_readlane_b32 s98, v28, 32
+; SI-NEXT:    v_readlane_b32 s97, v28, 31
+; SI-NEXT:    v_readlane_b32 s96, v28, 30
+; SI-NEXT:    v_readlane_b32 s87, v28, 29
+; SI-NEXT:    v_readlane_b32 s86, v28, 28
+; SI-NEXT:    v_readlane_b32 s85, v28, 27
+; SI-NEXT:    v_readlane_b32 s84, v28, 26
+; SI-NEXT:    v_readlane_b32 s83, v28, 25
+; SI-NEXT:    v_readlane_b32 s82, v28, 24
+; SI-NEXT:    v_readlane_b32 s81, v28, 23
+; SI-NEXT:    v_readlane_b32 s80, v28, 22
+; SI-NEXT:    v_readlane_b32 s71, v28, 21
+; SI-NEXT:    v_readlane_b32 s70, v28, 20
+; SI-NEXT:    v_readlane_b32 s69, v28, 19
+; SI-NEXT:    v_readlane_b32 s68, v28, 18
+; SI-NEXT:    v_readlane_b32 s67, v28, 17
+; SI-NEXT:    v_readlane_b32 s66, v28, 16
+; SI-NEXT:    v_readlane_b32 s65, v28, 15
+; SI-NEXT:    v_readlane_b32 s64, v28, 14
+; SI-NEXT:    v_readlane_b32 s55, v28, 13
+; SI-NEXT:    v_readlane_b32 s54, v28, 12
+; SI-NEXT:    v_readlane_b32 s53, v28, 11
+; SI-NEXT:    v_readlane_b32 s52, v28, 10
+; SI-NEXT:    v_readlane_b32 s51, v28, 9
+; SI-NEXT:    v_readlane_b32 s50, v28, 8
+; SI-NEXT:    v_readlane_b32 s49, v28, 7
+; SI-NEXT:    v_readlane_b32 s48, v28, 6
+; SI-NEXT:    v_readlane_b32 s39, v28, 5
+; SI-NEXT:    v_readlane_b32 s38, v28, 4
+; SI-NEXT:    v_readlane_b32 s37, v28, 3
+; SI-NEXT:    v_readlane_b32 s36, v28, 2
+; SI-NEXT:    v_readlane_b32 s35, v28, 1
+; SI-NEXT:    v_readlane_b32 s34, v28, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -44559,9 +44559,9 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v28, s30, 0
-; VI-NEXT:    v_writelane_b32 v28, s31, 1
-; VI-NEXT:    v_writelane_b32 v28, s34, 2
+; VI-NEXT:    v_writelane_b32 v28, s34, 0
+; VI-NEXT:    v_writelane_b32 v28, s35, 1
+; VI-NEXT:    v_writelane_b32 v28, s30, 2
 ; VI-NEXT:    v_readfirstlane_b32 s7, v13
 ; VI-NEXT:    v_readfirstlane_b32 s8, v12
 ; VI-NEXT:    v_readfirstlane_b32 s10, v11
@@ -44576,7 +44576,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s76, v2
 ; VI-NEXT:    v_readfirstlane_b32 s79, v1
 ; VI-NEXT:    v_readfirstlane_b32 s90, v0
-; VI-NEXT:    v_writelane_b32 v28, s35, 3
+; VI-NEXT:    v_writelane_b32 v28, s31, 3
 ; VI-NEXT:    s_lshr_b32 s42, s29, 16
 ; VI-NEXT:    s_lshr_b32 s45, s28, 16
 ; VI-NEXT:    s_lshr_b32 s46, s27, 16
@@ -44752,6 +44752,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; VI-NEXT:    s_or_b32 s10, s10, s11
 ; VI-NEXT:    s_or_b32 s8, s8, s9
 ; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    v_readlane_b32 s30, v28, 2
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -44780,10 +44781,9 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v25, s10
 ; VI-NEXT:    v_mov_b32_e32 v26, s8
 ; VI-NEXT:    v_mov_b32_e32 v27, s6
-; VI-NEXT:    v_readlane_b32 s35, v28, 3
-; VI-NEXT:    v_readlane_b32 s34, v28, 2
-; VI-NEXT:    v_readlane_b32 s31, v28, 1
-; VI-NEXT:    v_readlane_b32 s30, v28, 0
+; VI-NEXT:    v_readlane_b32 s31, v28, 3
+; VI-NEXT:    v_readlane_b32 s35, v28, 1
+; VI-NEXT:    v_readlane_b32 s34, v28, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45501,7 +45501,7 @@ end:
   ret <56 x half> %phi
 }
 
-define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
+define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v56i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46459,7 +46459,7 @@ end:
   ret <56 x i16> %phi
 }
 
-define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v56f16_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46963,9 +46963,10 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v56, s30, 0
-; VI-NEXT:    v_writelane_b32 v56, s31, 1
-; VI-NEXT:    v_writelane_b32 v56, s34, 2
+; VI-NEXT:    v_writelane_b32 v56, s34, 0
+; VI-NEXT:    v_writelane_b32 v56, s35, 1
+; VI-NEXT:    v_writelane_b32 v56, s30, 2
+; VI-NEXT:    v_writelane_b32 v56, s31, 3
 ; VI-NEXT:    v_readfirstlane_b32 s44, v13
 ; VI-NEXT:    v_readfirstlane_b32 s46, v12
 ; VI-NEXT:    v_readfirstlane_b32 s56, v11
@@ -46980,7 +46981,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s90, v2
 ; VI-NEXT:    v_readfirstlane_b32 s30, v1
 ; VI-NEXT:    v_readfirstlane_b32 s34, v0
-; VI-NEXT:    v_writelane_b32 v56, s35, 3
 ; VI-NEXT:    s_lshr_b32 s6, s29, 16
 ; VI-NEXT:    s_lshr_b32 s7, s28, 16
 ; VI-NEXT:    s_lshr_b32 s8, s27, 16
@@ -47185,6 +47185,7 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; VI-NEXT:    v_readlane_b32 s30, v56, 2
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -47205,10 +47206,9 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s35, v56, 3
-; VI-NEXT:    v_readlane_b32 s34, v56, 2
-; VI-NEXT:    v_readlane_b32 s31, v56, 1
-; VI-NEXT:    v_readlane_b32 s30, v56, 0
+; VI-NEXT:    v_readlane_b32 s31, v56, 3
+; VI-NEXT:    v_readlane_b32 s35, v56, 1
+; VI-NEXT:    v_readlane_b32 s34, v56, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -47924,3 +47924,5 @@ end:
   %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <56 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index d6cf0367f1b20..8c5d66f1227ea 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) {
+define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -196,7 +196,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -557,7 +557,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) {
+define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -732,7 +732,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1308,7 +1308,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) {
+define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1498,7 +1498,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1859,7 +1859,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) {
+define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2057,7 +2057,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2418,7 +2418,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) {
+define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2608,7 +2608,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2969,7 +2969,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) {
+define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3099,7 +3099,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3615,7 +3615,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) {
+define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4523,7 +4523,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4531,23 +4531,23 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
+; SI-NEXT:    v_writelane_b32 v30, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -4565,7 +4565,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    s_cmp_lg_u32 s42, 0
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
+; SI-NEXT:    v_writelane_b32 v30, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s34, s5, 16
@@ -4751,6 +4751,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s44, s34, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s45
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -4781,23 +4782,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 16
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -4842,14 +4842,14 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -4867,7 +4867,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s45, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -5053,6 +5053,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s44, s46, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s44
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -5083,14 +5084,13 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v27, s8
 ; VI-NEXT:    v_mov_b32_e32 v28, s7
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5135,10 +5135,10 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v30, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v30, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v30, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v30, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v30, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v30, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -5156,7 +5156,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v0
-; GFX9-NEXT:    v_writelane_b32 v30, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v30, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -5282,6 +5282,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s56
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s47
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s46
+; GFX9-NEXT:    v_readlane_b32 s30, v30, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
@@ -5312,10 +5313,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v27, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v28, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v29, s6
-; GFX9-NEXT:    v_readlane_b32 s35, v30, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v30, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v30, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v30, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v30, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v30, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v30, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -5564,7 +5564,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
+define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6909,7 +6909,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6917,41 +6917,42 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
 ; SI-NEXT:    v_readfirstlane_b32 s9, v14
 ; SI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -6968,7 +6969,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s70, v2
 ; SI-NEXT:    v_readfirstlane_b32 s81, v1
 ; SI-NEXT:    v_readfirstlane_b32 s84, v0
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
 ; SI-NEXT:    s_lshr_b32 s94, s27, 16
@@ -7246,6 +7246,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; SI-NEXT:  .LBB15_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -7276,42 +7277,41 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
 ; SI-NEXT:    v_mov_b32_e32 v28, s64
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -7328,47 +7328,48 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
-; VI-NEXT:    v_writelane_b32 v30, s48, 8
-; VI-NEXT:    v_writelane_b32 v30, s49, 9
-; VI-NEXT:    v_writelane_b32 v30, s50, 10
-; VI-NEXT:    v_writelane_b32 v30, s51, 11
-; VI-NEXT:    v_writelane_b32 v30, s52, 12
-; VI-NEXT:    v_writelane_b32 v30, s53, 13
-; VI-NEXT:    v_writelane_b32 v30, s54, 14
-; VI-NEXT:    v_writelane_b32 v30, s55, 15
-; VI-NEXT:    v_writelane_b32 v30, s64, 16
-; VI-NEXT:    v_writelane_b32 v30, s65, 17
-; VI-NEXT:    v_writelane_b32 v30, s66, 18
-; VI-NEXT:    v_writelane_b32 v30, s67, 19
-; VI-NEXT:    v_writelane_b32 v30, s68, 20
-; VI-NEXT:    v_writelane_b32 v30, s69, 21
-; VI-NEXT:    v_writelane_b32 v30, s70, 22
-; VI-NEXT:    v_writelane_b32 v30, s71, 23
-; VI-NEXT:    v_writelane_b32 v30, s80, 24
-; VI-NEXT:    v_writelane_b32 v30, s81, 25
-; VI-NEXT:    v_writelane_b32 v30, s82, 26
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s48, 6
+; VI-NEXT:    v_writelane_b32 v30, s49, 7
+; VI-NEXT:    v_writelane_b32 v30, s50, 8
+; VI-NEXT:    v_writelane_b32 v30, s51, 9
+; VI-NEXT:    v_writelane_b32 v30, s52, 10
+; VI-NEXT:    v_writelane_b32 v30, s53, 11
+; VI-NEXT:    v_writelane_b32 v30, s54, 12
+; VI-NEXT:    v_writelane_b32 v30, s55, 13
+; VI-NEXT:    v_writelane_b32 v30, s64, 14
+; VI-NEXT:    v_writelane_b32 v30, s65, 15
+; VI-NEXT:    v_writelane_b32 v30, s66, 16
+; VI-NEXT:    v_writelane_b32 v30, s67, 17
+; VI-NEXT:    v_writelane_b32 v30, s68, 18
+; VI-NEXT:    v_writelane_b32 v30, s69, 19
+; VI-NEXT:    v_writelane_b32 v30, s70, 20
+; VI-NEXT:    v_writelane_b32 v30, s71, 21
+; VI-NEXT:    v_writelane_b32 v30, s80, 22
+; VI-NEXT:    v_writelane_b32 v30, s81, 23
+; VI-NEXT:    v_writelane_b32 v30, s82, 24
+; VI-NEXT:    v_writelane_b32 v30, s83, 25
+; VI-NEXT:    v_writelane_b32 v30, s84, 26
+; VI-NEXT:    v_writelane_b32 v30, s85, 27
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
-; VI-NEXT:    v_writelane_b32 v30, s83, 27
+; VI-NEXT:    v_writelane_b32 v30, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s11, 16
 ; VI-NEXT:    v_readfirstlane_b32 s13, v12
 ; VI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v30, s84, 28
+; VI-NEXT:    v_writelane_b32 v30, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s13, 16
 ; VI-NEXT:    v_readfirstlane_b32 s15, v11
 ; VI-NEXT:    v_writelane_b32 v31, s63, 0
-; VI-NEXT:    v_writelane_b32 v30, s85, 29
+; VI-NEXT:    v_writelane_b32 v30, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s15, 16
 ; VI-NEXT:    v_readfirstlane_b32 s73, v10
 ; VI-NEXT:    v_writelane_b32 v31, s62, 1
-; VI-NEXT:    v_writelane_b32 v30, s86, 30
+; VI-NEXT:    v_writelane_b32 v30, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    s_lshr_b32 s60, s73, 16
@@ -7383,7 +7384,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
 ; VI-NEXT:    v_writelane_b32 v31, s61, 2
-; VI-NEXT:    v_writelane_b32 v30, s87, 31
 ; VI-NEXT:    s_lshr_b32 s90, s29, 16
 ; VI-NEXT:    s_lshr_b32 s30, s28, 16
 ; VI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -7666,6 +7666,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; VI-NEXT:  .LBB15_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v30, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -7696,38 +7697,37 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
 ; VI-NEXT:    v_mov_b32_e32 v28, s64
 ; VI-NEXT:    v_mov_b32_e32 v29, s65
-; VI-NEXT:    v_readlane_b32 s87, v30, 31
-; VI-NEXT:    v_readlane_b32 s86, v30, 30
-; VI-NEXT:    v_readlane_b32 s85, v30, 29
-; VI-NEXT:    v_readlane_b32 s84, v30, 28
-; VI-NEXT:    v_readlane_b32 s83, v30, 27
-; VI-NEXT:    v_readlane_b32 s82, v30, 26
-; VI-NEXT:    v_readlane_b32 s81, v30, 25
-; VI-NEXT:    v_readlane_b32 s80, v30, 24
-; VI-NEXT:    v_readlane_b32 s71, v30, 23
-; VI-NEXT:    v_readlane_b32 s70, v30, 22
-; VI-NEXT:    v_readlane_b32 s69, v30, 21
-; VI-NEXT:    v_readlane_b32 s68, v30, 20
-; VI-NEXT:    v_readlane_b32 s67, v30, 19
-; VI-NEXT:    v_readlane_b32 s66, v30, 18
-; VI-NEXT:    v_readlane_b32 s65, v30, 17
-; VI-NEXT:    v_readlane_b32 s64, v30, 16
-; VI-NEXT:    v_readlane_b32 s55, v30, 15
-; VI-NEXT:    v_readlane_b32 s54, v30, 14
-; VI-NEXT:    v_readlane_b32 s53, v30, 13
-; VI-NEXT:    v_readlane_b32 s52, v30, 12
-; VI-NEXT:    v_readlane_b32 s51, v30, 11
-; VI-NEXT:    v_readlane_b32 s50, v30, 10
-; VI-NEXT:    v_readlane_b32 s49, v30, 9
-; VI-NEXT:    v_readlane_b32 s48, v30, 8
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 31
+; VI-NEXT:    v_readlane_b32 s87, v30, 29
+; VI-NEXT:    v_readlane_b32 s86, v30, 28
+; VI-NEXT:    v_readlane_b32 s85, v30, 27
+; VI-NEXT:    v_readlane_b32 s84, v30, 26
+; VI-NEXT:    v_readlane_b32 s83, v30, 25
+; VI-NEXT:    v_readlane_b32 s82, v30, 24
+; VI-NEXT:    v_readlane_b32 s81, v30, 23
+; VI-NEXT:    v_readlane_b32 s80, v30, 22
+; VI-NEXT:    v_readlane_b32 s71, v30, 21
+; VI-NEXT:    v_readlane_b32 s70, v30, 20
+; VI-NEXT:    v_readlane_b32 s69, v30, 19
+; VI-NEXT:    v_readlane_b32 s68, v30, 18
+; VI-NEXT:    v_readlane_b32 s67, v30, 17
+; VI-NEXT:    v_readlane_b32 s66, v30, 16
+; VI-NEXT:    v_readlane_b32 s65, v30, 15
+; VI-NEXT:    v_readlane_b32 s64, v30, 14
+; VI-NEXT:    v_readlane_b32 s55, v30, 13
+; VI-NEXT:    v_readlane_b32 s54, v30, 12
+; VI-NEXT:    v_readlane_b32 s53, v30, 11
+; VI-NEXT:    v_readlane_b32 s52, v30, 10
+; VI-NEXT:    v_readlane_b32 s51, v30, 9
+; VI-NEXT:    v_readlane_b32 s50, v30, 8
+; VI-NEXT:    v_readlane_b32 s49, v30, 7
+; VI-NEXT:    v_readlane_b32 s48, v30, 6
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -8082,7 +8082,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
+define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8990,7 +8990,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30i32_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8998,23 +8998,23 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
+; SI-NEXT:    v_writelane_b32 v30, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -9032,7 +9032,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    s_cmp_lg_u32 s42, 0
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
+; SI-NEXT:    v_writelane_b32 v30, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s34, s5, 16
@@ -9218,6 +9218,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s44, s34, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s45
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -9248,23 +9249,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 16
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9309,14 +9309,14 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -9334,7 +9334,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s45, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -9520,6 +9520,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s44, s46, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s44
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -9550,14 +9551,13 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v27, s8
 ; VI-NEXT:    v_mov_b32_e32 v28, s7
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9602,10 +9602,10 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v30, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v30, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v30, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v30, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v30, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v30, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -9623,7 +9623,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v0
-; GFX9-NEXT:    v_writelane_b32 v30, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v30, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -9749,6 +9749,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s56
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s47
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s46
+; GFX9-NEXT:    v_readlane_b32 s30, v30, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
@@ -9779,10 +9780,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v27, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v28, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v29, s6
-; GFX9-NEXT:    v_readlane_b32 s35, v30, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v30, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v30, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v30, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v30, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v30, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v30, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -10031,7 +10031,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
+define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v30i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11520,7 +11520,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v30i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11528,41 +11528,42 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s8, v14
 ; SI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -11579,7 +11580,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s35, v1
 ; SI-NEXT:    v_readfirstlane_b32 s70, v0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s31, s29, 16
 ; SI-NEXT:    s_lshr_b32 s68, s28, 16
 ; SI-NEXT:    s_lshr_b32 s71, s27, 16
@@ -11984,42 +11984,42 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB19_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -12033,53 +12033,54 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
 ; VI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
 ; VI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
 ; VI-NEXT:    v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v12
 ; VI-NEXT:    v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s12, 16
 ; VI-NEXT:    v_readfirstlane_b32 s14, v11
 ; VI-NEXT:    v_writelane_b32 v33, s63, 2
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s14, 16
 ; VI-NEXT:    v_readfirstlane_b32 s72, v10
 ; VI-NEXT:    v_writelane_b32 v33, s62, 3
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s60, s72, 16
 ; VI-NEXT:    v_readfirstlane_b32 s74, v9
 ; VI-NEXT:    v_readfirstlane_b32 s76, v8
@@ -12092,7 +12093,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
 ; VI-NEXT:    v_writelane_b32 v33, s61, 4
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s88, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -12391,38 +12391,38 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB19_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -12771,7 +12771,7 @@ end:
   ret <30 x i32> %phi
 }
 
-define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) {
+define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12946,7 +12946,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13522,7 +13522,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) {
+define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13720,7 +13720,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14081,7 +14081,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) {
+define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14256,7 +14256,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14832,7 +14832,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) {
+define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14962,7 +14962,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15478,7 +15478,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) {
+define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16356,7 +16356,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16364,23 +16364,23 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s30, 0
-; SI-NEXT:    v_writelane_b32 v62, s31, 1
-; SI-NEXT:    v_writelane_b32 v62, s34, 2
-; SI-NEXT:    v_writelane_b32 v62, s35, 3
-; SI-NEXT:    v_writelane_b32 v62, s36, 4
-; SI-NEXT:    v_writelane_b32 v62, s37, 5
-; SI-NEXT:    v_writelane_b32 v62, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s54, 14
+; SI-NEXT:    v_writelane_b32 v62, s34, 0
+; SI-NEXT:    v_writelane_b32 v62, s35, 1
+; SI-NEXT:    v_writelane_b32 v62, s36, 2
+; SI-NEXT:    v_writelane_b32 v62, s37, 3
+; SI-NEXT:    v_writelane_b32 v62, s38, 4
+; SI-NEXT:    v_writelane_b32 v62, s39, 5
+; SI-NEXT:    v_writelane_b32 v62, s48, 6
+; SI-NEXT:    v_writelane_b32 v62, s49, 7
+; SI-NEXT:    v_writelane_b32 v62, s50, 8
+; SI-NEXT:    v_writelane_b32 v62, s51, 9
+; SI-NEXT:    v_writelane_b32 v62, s52, 10
+; SI-NEXT:    v_writelane_b32 v62, s53, 11
+; SI-NEXT:    v_writelane_b32 v62, s54, 12
+; SI-NEXT:    v_writelane_b32 v62, s55, 13
+; SI-NEXT:    v_writelane_b32 v62, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v62, s55, 15
+; SI-NEXT:    v_writelane_b32 v62, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -16412,7 +16412,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s64, s5, 16
@@ -16704,6 +16704,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v30
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; SI-NEXT:    v_readlane_b32 s30, v62, 15
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v38
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v37
@@ -16714,23 +16715,22 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v32
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v31
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_readlane_b32 s64, v62, 16
-; SI-NEXT:    v_readlane_b32 s55, v62, 15
-; SI-NEXT:    v_readlane_b32 s54, v62, 14
-; SI-NEXT:    v_readlane_b32 s53, v62, 13
-; SI-NEXT:    v_readlane_b32 s52, v62, 12
-; SI-NEXT:    v_readlane_b32 s51, v62, 11
-; SI-NEXT:    v_readlane_b32 s50, v62, 10
-; SI-NEXT:    v_readlane_b32 s49, v62, 9
-; SI-NEXT:    v_readlane_b32 s48, v62, 8
-; SI-NEXT:    v_readlane_b32 s39, v62, 7
-; SI-NEXT:    v_readlane_b32 s38, v62, 6
-; SI-NEXT:    v_readlane_b32 s37, v62, 5
-; SI-NEXT:    v_readlane_b32 s36, v62, 4
-; SI-NEXT:    v_readlane_b32 s35, v62, 3
-; SI-NEXT:    v_readlane_b32 s34, v62, 2
-; SI-NEXT:    v_readlane_b32 s31, v62, 1
-; SI-NEXT:    v_readlane_b32 s30, v62, 0
+; SI-NEXT:    v_readlane_b32 s31, v62, 16
+; SI-NEXT:    v_readlane_b32 s64, v62, 14
+; SI-NEXT:    v_readlane_b32 s55, v62, 13
+; SI-NEXT:    v_readlane_b32 s54, v62, 12
+; SI-NEXT:    v_readlane_b32 s53, v62, 11
+; SI-NEXT:    v_readlane_b32 s52, v62, 10
+; SI-NEXT:    v_readlane_b32 s51, v62, 9
+; SI-NEXT:    v_readlane_b32 s50, v62, 8
+; SI-NEXT:    v_readlane_b32 s49, v62, 7
+; SI-NEXT:    v_readlane_b32 s48, v62, 6
+; SI-NEXT:    v_readlane_b32 s39, v62, 5
+; SI-NEXT:    v_readlane_b32 s38, v62, 4
+; SI-NEXT:    v_readlane_b32 s37, v62, 3
+; SI-NEXT:    v_readlane_b32 s36, v62, 2
+; SI-NEXT:    v_readlane_b32 s35, v62, 1
+; SI-NEXT:    v_readlane_b32 s34, v62, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -16743,14 +16743,14 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -16780,7 +16780,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -17024,6 +17024,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -17042,14 +17043,13 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17062,10 +17062,10 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -17095,7 +17095,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB29_3
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -17339,6 +17339,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v55, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v54, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v53, 16, v14
@@ -17357,10 +17358,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v32, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v31, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v30, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17834,7 +17834,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
+define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19179,7 +19179,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19187,41 +19187,42 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
 ; SI-NEXT:    v_readfirstlane_b32 s9, v14
 ; SI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -19238,7 +19239,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s70, v2
 ; SI-NEXT:    v_readfirstlane_b32 s81, v1
 ; SI-NEXT:    v_readfirstlane_b32 s84, v0
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
 ; SI-NEXT:    s_lshr_b32 s94, s27, 16
@@ -19516,6 +19516,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; SI-NEXT:  .LBB31_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -19546,42 +19547,41 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
 ; SI-NEXT:    v_mov_b32_e32 v28, s64
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19598,47 +19598,48 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
-; VI-NEXT:    v_writelane_b32 v30, s48, 8
-; VI-NEXT:    v_writelane_b32 v30, s49, 9
-; VI-NEXT:    v_writelane_b32 v30, s50, 10
-; VI-NEXT:    v_writelane_b32 v30, s51, 11
-; VI-NEXT:    v_writelane_b32 v30, s52, 12
-; VI-NEXT:    v_writelane_b32 v30, s53, 13
-; VI-NEXT:    v_writelane_b32 v30, s54, 14
-; VI-NEXT:    v_writelane_b32 v30, s55, 15
-; VI-NEXT:    v_writelane_b32 v30, s64, 16
-; VI-NEXT:    v_writelane_b32 v30, s65, 17
-; VI-NEXT:    v_writelane_b32 v30, s66, 18
-; VI-NEXT:    v_writelane_b32 v30, s67, 19
-; VI-NEXT:    v_writelane_b32 v30, s68, 20
-; VI-NEXT:    v_writelane_b32 v30, s69, 21
-; VI-NEXT:    v_writelane_b32 v30, s70, 22
-; VI-NEXT:    v_writelane_b32 v30, s71, 23
-; VI-NEXT:    v_writelane_b32 v30, s80, 24
-; VI-NEXT:    v_writelane_b32 v30, s81, 25
-; VI-NEXT:    v_writelane_b32 v30, s82, 26
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s48, 6
+; VI-NEXT:    v_writelane_b32 v30, s49, 7
+; VI-NEXT:    v_writelane_b32 v30, s50, 8
+; VI-NEXT:    v_writelane_b32 v30, s51, 9
+; VI-NEXT:    v_writelane_b32 v30, s52, 10
+; VI-NEXT:    v_writelane_b32 v30, s53, 11
+; VI-NEXT:    v_writelane_b32 v30, s54, 12
+; VI-NEXT:    v_writelane_b32 v30, s55, 13
+; VI-NEXT:    v_writelane_b32 v30, s64, 14
+; VI-NEXT:    v_writelane_b32 v30, s65, 15
+; VI-NEXT:    v_writelane_b32 v30, s66, 16
+; VI-NEXT:    v_writelane_b32 v30, s67, 17
+; VI-NEXT:    v_writelane_b32 v30, s68, 18
+; VI-NEXT:    v_writelane_b32 v30, s69, 19
+; VI-NEXT:    v_writelane_b32 v30, s70, 20
+; VI-NEXT:    v_writelane_b32 v30, s71, 21
+; VI-NEXT:    v_writelane_b32 v30, s80, 22
+; VI-NEXT:    v_writelane_b32 v30, s81, 23
+; VI-NEXT:    v_writelane_b32 v30, s82, 24
+; VI-NEXT:    v_writelane_b32 v30, s83, 25
+; VI-NEXT:    v_writelane_b32 v30, s84, 26
+; VI-NEXT:    v_writelane_b32 v30, s85, 27
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
-; VI-NEXT:    v_writelane_b32 v30, s83, 27
+; VI-NEXT:    v_writelane_b32 v30, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s11, 16
 ; VI-NEXT:    v_readfirstlane_b32 s13, v12
 ; VI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v30, s84, 28
+; VI-NEXT:    v_writelane_b32 v30, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s13, 16
 ; VI-NEXT:    v_readfirstlane_b32 s15, v11
 ; VI-NEXT:    v_writelane_b32 v31, s63, 0
-; VI-NEXT:    v_writelane_b32 v30, s85, 29
+; VI-NEXT:    v_writelane_b32 v30, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s15, 16
 ; VI-NEXT:    v_readfirstlane_b32 s73, v10
 ; VI-NEXT:    v_writelane_b32 v31, s62, 1
-; VI-NEXT:    v_writelane_b32 v30, s86, 30
+; VI-NEXT:    v_writelane_b32 v30, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    s_lshr_b32 s60, s73, 16
@@ -19653,7 +19654,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
 ; VI-NEXT:    v_writelane_b32 v31, s61, 2
-; VI-NEXT:    v_writelane_b32 v30, s87, 31
 ; VI-NEXT:    s_lshr_b32 s90, s29, 16
 ; VI-NEXT:    s_lshr_b32 s30, s28, 16
 ; VI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -19936,6 +19936,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; VI-NEXT:  .LBB31_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v30, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -19966,38 +19967,37 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
 ; VI-NEXT:    v_mov_b32_e32 v28, s64
 ; VI-NEXT:    v_mov_b32_e32 v29, s65
-; VI-NEXT:    v_readlane_b32 s87, v30, 31
-; VI-NEXT:    v_readlane_b32 s86, v30, 30
-; VI-NEXT:    v_readlane_b32 s85, v30, 29
-; VI-NEXT:    v_readlane_b32 s84, v30, 28
-; VI-NEXT:    v_readlane_b32 s83, v30, 27
-; VI-NEXT:    v_readlane_b32 s82, v30, 26
-; VI-NEXT:    v_readlane_b32 s81, v30, 25
-; VI-NEXT:    v_readlane_b32 s80, v30, 24
-; VI-NEXT:    v_readlane_b32 s71, v30, 23
-; VI-NEXT:    v_readlane_b32 s70, v30, 22
-; VI-NEXT:    v_readlane_b32 s69, v30, 21
-; VI-NEXT:    v_readlane_b32 s68, v30, 20
-; VI-NEXT:    v_readlane_b32 s67, v30, 19
-; VI-NEXT:    v_readlane_b32 s66, v30, 18
-; VI-NEXT:    v_readlane_b32 s65, v30, 17
-; VI-NEXT:    v_readlane_b32 s64, v30, 16
-; VI-NEXT:    v_readlane_b32 s55, v30, 15
-; VI-NEXT:    v_readlane_b32 s54, v30, 14
-; VI-NEXT:    v_readlane_b32 s53, v30, 13
-; VI-NEXT:    v_readlane_b32 s52, v30, 12
-; VI-NEXT:    v_readlane_b32 s51, v30, 11
-; VI-NEXT:    v_readlane_b32 s50, v30, 10
-; VI-NEXT:    v_readlane_b32 s49, v30, 9
-; VI-NEXT:    v_readlane_b32 s48, v30, 8
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 31
+; VI-NEXT:    v_readlane_b32 s87, v30, 29
+; VI-NEXT:    v_readlane_b32 s86, v30, 28
+; VI-NEXT:    v_readlane_b32 s85, v30, 27
+; VI-NEXT:    v_readlane_b32 s84, v30, 26
+; VI-NEXT:    v_readlane_b32 s83, v30, 25
+; VI-NEXT:    v_readlane_b32 s82, v30, 24
+; VI-NEXT:    v_readlane_b32 s81, v30, 23
+; VI-NEXT:    v_readlane_b32 s80, v30, 22
+; VI-NEXT:    v_readlane_b32 s71, v30, 21
+; VI-NEXT:    v_readlane_b32 s70, v30, 20
+; VI-NEXT:    v_readlane_b32 s69, v30, 19
+; VI-NEXT:    v_readlane_b32 s68, v30, 18
+; VI-NEXT:    v_readlane_b32 s67, v30, 17
+; VI-NEXT:    v_readlane_b32 s66, v30, 16
+; VI-NEXT:    v_readlane_b32 s65, v30, 15
+; VI-NEXT:    v_readlane_b32 s64, v30, 14
+; VI-NEXT:    v_readlane_b32 s55, v30, 13
+; VI-NEXT:    v_readlane_b32 s54, v30, 12
+; VI-NEXT:    v_readlane_b32 s53, v30, 11
+; VI-NEXT:    v_readlane_b32 s52, v30, 10
+; VI-NEXT:    v_readlane_b32 s51, v30, 9
+; VI-NEXT:    v_readlane_b32 s50, v30, 8
+; VI-NEXT:    v_readlane_b32 s49, v30, 7
+; VI-NEXT:    v_readlane_b32 s48, v30, 6
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -20352,7 +20352,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
+define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21230,7 +21230,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v30f32_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21238,23 +21238,23 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s30, 0
-; SI-NEXT:    v_writelane_b32 v62, s31, 1
-; SI-NEXT:    v_writelane_b32 v62, s34, 2
-; SI-NEXT:    v_writelane_b32 v62, s35, 3
-; SI-NEXT:    v_writelane_b32 v62, s36, 4
-; SI-NEXT:    v_writelane_b32 v62, s37, 5
-; SI-NEXT:    v_writelane_b32 v62, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s54, 14
+; SI-NEXT:    v_writelane_b32 v62, s34, 0
+; SI-NEXT:    v_writelane_b32 v62, s35, 1
+; SI-NEXT:    v_writelane_b32 v62, s36, 2
+; SI-NEXT:    v_writelane_b32 v62, s37, 3
+; SI-NEXT:    v_writelane_b32 v62, s38, 4
+; SI-NEXT:    v_writelane_b32 v62, s39, 5
+; SI-NEXT:    v_writelane_b32 v62, s48, 6
+; SI-NEXT:    v_writelane_b32 v62, s49, 7
+; SI-NEXT:    v_writelane_b32 v62, s50, 8
+; SI-NEXT:    v_writelane_b32 v62, s51, 9
+; SI-NEXT:    v_writelane_b32 v62, s52, 10
+; SI-NEXT:    v_writelane_b32 v62, s53, 11
+; SI-NEXT:    v_writelane_b32 v62, s54, 12
+; SI-NEXT:    v_writelane_b32 v62, s55, 13
+; SI-NEXT:    v_writelane_b32 v62, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v62, s55, 15
+; SI-NEXT:    v_writelane_b32 v62, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -21286,7 +21286,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s64, s5, 16
@@ -21578,6 +21578,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v30
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; SI-NEXT:    v_readlane_b32 s30, v62, 15
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v38
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v37
@@ -21588,23 +21589,22 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v32
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v31
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_readlane_b32 s64, v62, 16
-; SI-NEXT:    v_readlane_b32 s55, v62, 15
-; SI-NEXT:    v_readlane_b32 s54, v62, 14
-; SI-NEXT:    v_readlane_b32 s53, v62, 13
-; SI-NEXT:    v_readlane_b32 s52, v62, 12
-; SI-NEXT:    v_readlane_b32 s51, v62, 11
-; SI-NEXT:    v_readlane_b32 s50, v62, 10
-; SI-NEXT:    v_readlane_b32 s49, v62, 9
-; SI-NEXT:    v_readlane_b32 s48, v62, 8
-; SI-NEXT:    v_readlane_b32 s39, v62, 7
-; SI-NEXT:    v_readlane_b32 s38, v62, 6
-; SI-NEXT:    v_readlane_b32 s37, v62, 5
-; SI-NEXT:    v_readlane_b32 s36, v62, 4
-; SI-NEXT:    v_readlane_b32 s35, v62, 3
-; SI-NEXT:    v_readlane_b32 s34, v62, 2
-; SI-NEXT:    v_readlane_b32 s31, v62, 1
-; SI-NEXT:    v_readlane_b32 s30, v62, 0
+; SI-NEXT:    v_readlane_b32 s31, v62, 16
+; SI-NEXT:    v_readlane_b32 s64, v62, 14
+; SI-NEXT:    v_readlane_b32 s55, v62, 13
+; SI-NEXT:    v_readlane_b32 s54, v62, 12
+; SI-NEXT:    v_readlane_b32 s53, v62, 11
+; SI-NEXT:    v_readlane_b32 s52, v62, 10
+; SI-NEXT:    v_readlane_b32 s51, v62, 9
+; SI-NEXT:    v_readlane_b32 s50, v62, 8
+; SI-NEXT:    v_readlane_b32 s49, v62, 7
+; SI-NEXT:    v_readlane_b32 s48, v62, 6
+; SI-NEXT:    v_readlane_b32 s39, v62, 5
+; SI-NEXT:    v_readlane_b32 s38, v62, 4
+; SI-NEXT:    v_readlane_b32 s37, v62, 3
+; SI-NEXT:    v_readlane_b32 s36, v62, 2
+; SI-NEXT:    v_readlane_b32 s35, v62, 1
+; SI-NEXT:    v_readlane_b32 s34, v62, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21617,14 +21617,14 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -21654,7 +21654,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -21898,6 +21898,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -21916,14 +21917,13 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -21936,10 +21936,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -21969,7 +21969,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB33_3
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -22213,6 +22213,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v55, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v54, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v53, 16, v14
@@ -22231,10 +22232,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v32, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v31, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v30, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -22708,7 +22708,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
+define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v30f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24197,7 +24197,7 @@ end:
   ret <30 x float> %phi
 }
 
-define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v30f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24205,41 +24205,42 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s8, v14
 ; SI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -24256,7 +24257,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s35, v1
 ; SI-NEXT:    v_readfirstlane_b32 s70, v0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s31, s29, 16
 ; SI-NEXT:    s_lshr_b32 s68, s28, 16
 ; SI-NEXT:    s_lshr_b32 s71, s27, 16
@@ -24661,42 +24661,42 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB35_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24710,53 +24710,54 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
 ; VI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
 ; VI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
 ; VI-NEXT:    v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v12
 ; VI-NEXT:    v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s12, 16
 ; VI-NEXT:    v_readfirstlane_b32 s14, v11
 ; VI-NEXT:    v_writelane_b32 v33, s63, 2
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s14, 16
 ; VI-NEXT:    v_readfirstlane_b32 s72, v10
 ; VI-NEXT:    v_writelane_b32 v33, s62, 3
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s60, s72, 16
 ; VI-NEXT:    v_readfirstlane_b32 s74, v9
 ; VI-NEXT:    v_readfirstlane_b32 s76, v8
@@ -24769,7 +24770,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
 ; VI-NEXT:    v_writelane_b32 v33, s61, 4
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s88, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -25068,38 +25068,38 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB35_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -25448,7 +25448,7 @@ end:
   ret <30 x float> %phi
 }
 
-define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) {
+define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25646,7 +25646,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26006,7 +26006,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) {
+define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26136,7 +26136,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26652,7 +26652,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) {
+define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27576,7 +27576,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27584,23 +27584,23 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
+; SI-NEXT:    v_writelane_b32 v30, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -27618,7 +27618,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    s_cmp_lg_u32 s42, 0
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
+; SI-NEXT:    v_writelane_b32 v30, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s34, s5, 16
@@ -27804,6 +27804,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s44, s34, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s45
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -27834,23 +27835,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 16
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27895,14 +27895,14 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -27920,7 +27920,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s45, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -28106,6 +28106,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s44, s46, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s44
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -28136,14 +28137,13 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v27, s8
 ; VI-NEXT:    v_mov_b32_e32 v28, s7
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28188,10 +28188,10 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v30, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v30, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v30, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v30, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v30, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v30, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -28209,7 +28209,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v0
-; GFX9-NEXT:    v_writelane_b32 v30, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v30, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -28335,6 +28335,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s56
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s47
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s46
+; GFX9-NEXT:    v_readlane_b32 s30, v30, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
@@ -28365,10 +28366,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v27, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v28, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v29, s6
-; GFX9-NEXT:    v_readlane_b32 s35, v30, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v30, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v30, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v30, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v30, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v30, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v30, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28617,7 +28617,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
+define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29962,7 +29962,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29970,41 +29970,42 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
 ; SI-NEXT:    v_readfirstlane_b32 s9, v14
 ; SI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -30021,7 +30022,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s70, v2
 ; SI-NEXT:    v_readfirstlane_b32 s81, v1
 ; SI-NEXT:    v_readfirstlane_b32 s84, v0
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
 ; SI-NEXT:    s_lshr_b32 s94, s27, 16
@@ -30299,6 +30299,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -30329,42 +30330,41 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
 ; SI-NEXT:    v_mov_b32_e32 v28, s64
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -30381,47 +30381,48 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
-; VI-NEXT:    v_writelane_b32 v30, s48, 8
-; VI-NEXT:    v_writelane_b32 v30, s49, 9
-; VI-NEXT:    v_writelane_b32 v30, s50, 10
-; VI-NEXT:    v_writelane_b32 v30, s51, 11
-; VI-NEXT:    v_writelane_b32 v30, s52, 12
-; VI-NEXT:    v_writelane_b32 v30, s53, 13
-; VI-NEXT:    v_writelane_b32 v30, s54, 14
-; VI-NEXT:    v_writelane_b32 v30, s55, 15
-; VI-NEXT:    v_writelane_b32 v30, s64, 16
-; VI-NEXT:    v_writelane_b32 v30, s65, 17
-; VI-NEXT:    v_writelane_b32 v30, s66, 18
-; VI-NEXT:    v_writelane_b32 v30, s67, 19
-; VI-NEXT:    v_writelane_b32 v30, s68, 20
-; VI-NEXT:    v_writelane_b32 v30, s69, 21
-; VI-NEXT:    v_writelane_b32 v30, s70, 22
-; VI-NEXT:    v_writelane_b32 v30, s71, 23
-; VI-NEXT:    v_writelane_b32 v30, s80, 24
-; VI-NEXT:    v_writelane_b32 v30, s81, 25
-; VI-NEXT:    v_writelane_b32 v30, s82, 26
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s48, 6
+; VI-NEXT:    v_writelane_b32 v30, s49, 7
+; VI-NEXT:    v_writelane_b32 v30, s50, 8
+; VI-NEXT:    v_writelane_b32 v30, s51, 9
+; VI-NEXT:    v_writelane_b32 v30, s52, 10
+; VI-NEXT:    v_writelane_b32 v30, s53, 11
+; VI-NEXT:    v_writelane_b32 v30, s54, 12
+; VI-NEXT:    v_writelane_b32 v30, s55, 13
+; VI-NEXT:    v_writelane_b32 v30, s64, 14
+; VI-NEXT:    v_writelane_b32 v30, s65, 15
+; VI-NEXT:    v_writelane_b32 v30, s66, 16
+; VI-NEXT:    v_writelane_b32 v30, s67, 17
+; VI-NEXT:    v_writelane_b32 v30, s68, 18
+; VI-NEXT:    v_writelane_b32 v30, s69, 19
+; VI-NEXT:    v_writelane_b32 v30, s70, 20
+; VI-NEXT:    v_writelane_b32 v30, s71, 21
+; VI-NEXT:    v_writelane_b32 v30, s80, 22
+; VI-NEXT:    v_writelane_b32 v30, s81, 23
+; VI-NEXT:    v_writelane_b32 v30, s82, 24
+; VI-NEXT:    v_writelane_b32 v30, s83, 25
+; VI-NEXT:    v_writelane_b32 v30, s84, 26
+; VI-NEXT:    v_writelane_b32 v30, s85, 27
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
-; VI-NEXT:    v_writelane_b32 v30, s83, 27
+; VI-NEXT:    v_writelane_b32 v30, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s11, 16
 ; VI-NEXT:    v_readfirstlane_b32 s13, v12
 ; VI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v30, s84, 28
+; VI-NEXT:    v_writelane_b32 v30, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s13, 16
 ; VI-NEXT:    v_readfirstlane_b32 s15, v11
 ; VI-NEXT:    v_writelane_b32 v31, s63, 0
-; VI-NEXT:    v_writelane_b32 v30, s85, 29
+; VI-NEXT:    v_writelane_b32 v30, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s15, 16
 ; VI-NEXT:    v_readfirstlane_b32 s73, v10
 ; VI-NEXT:    v_writelane_b32 v31, s62, 1
-; VI-NEXT:    v_writelane_b32 v30, s86, 30
+; VI-NEXT:    v_writelane_b32 v30, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    s_lshr_b32 s60, s73, 16
@@ -30436,7 +30437,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
 ; VI-NEXT:    v_writelane_b32 v31, s61, 2
-; VI-NEXT:    v_writelane_b32 v30, s87, 31
 ; VI-NEXT:    s_lshr_b32 s90, s29, 16
 ; VI-NEXT:    s_lshr_b32 s30, s28, 16
 ; VI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -30719,6 +30719,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; VI-NEXT:  .LBB43_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v30, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -30749,38 +30750,37 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
 ; VI-NEXT:    v_mov_b32_e32 v28, s64
 ; VI-NEXT:    v_mov_b32_e32 v29, s65
-; VI-NEXT:    v_readlane_b32 s87, v30, 31
-; VI-NEXT:    v_readlane_b32 s86, v30, 30
-; VI-NEXT:    v_readlane_b32 s85, v30, 29
-; VI-NEXT:    v_readlane_b32 s84, v30, 28
-; VI-NEXT:    v_readlane_b32 s83, v30, 27
-; VI-NEXT:    v_readlane_b32 s82, v30, 26
-; VI-NEXT:    v_readlane_b32 s81, v30, 25
-; VI-NEXT:    v_readlane_b32 s80, v30, 24
-; VI-NEXT:    v_readlane_b32 s71, v30, 23
-; VI-NEXT:    v_readlane_b32 s70, v30, 22
-; VI-NEXT:    v_readlane_b32 s69, v30, 21
-; VI-NEXT:    v_readlane_b32 s68, v30, 20
-; VI-NEXT:    v_readlane_b32 s67, v30, 19
-; VI-NEXT:    v_readlane_b32 s66, v30, 18
-; VI-NEXT:    v_readlane_b32 s65, v30, 17
-; VI-NEXT:    v_readlane_b32 s64, v30, 16
-; VI-NEXT:    v_readlane_b32 s55, v30, 15
-; VI-NEXT:    v_readlane_b32 s54, v30, 14
-; VI-NEXT:    v_readlane_b32 s53, v30, 13
-; VI-NEXT:    v_readlane_b32 s52, v30, 12
-; VI-NEXT:    v_readlane_b32 s51, v30, 11
-; VI-NEXT:    v_readlane_b32 s50, v30, 10
-; VI-NEXT:    v_readlane_b32 s49, v30, 9
-; VI-NEXT:    v_readlane_b32 s48, v30, 8
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 31
+; VI-NEXT:    v_readlane_b32 s87, v30, 29
+; VI-NEXT:    v_readlane_b32 s86, v30, 28
+; VI-NEXT:    v_readlane_b32 s85, v30, 27
+; VI-NEXT:    v_readlane_b32 s84, v30, 26
+; VI-NEXT:    v_readlane_b32 s83, v30, 25
+; VI-NEXT:    v_readlane_b32 s82, v30, 24
+; VI-NEXT:    v_readlane_b32 s81, v30, 23
+; VI-NEXT:    v_readlane_b32 s80, v30, 22
+; VI-NEXT:    v_readlane_b32 s71, v30, 21
+; VI-NEXT:    v_readlane_b32 s70, v30, 20
+; VI-NEXT:    v_readlane_b32 s69, v30, 19
+; VI-NEXT:    v_readlane_b32 s68, v30, 18
+; VI-NEXT:    v_readlane_b32 s67, v30, 17
+; VI-NEXT:    v_readlane_b32 s66, v30, 16
+; VI-NEXT:    v_readlane_b32 s65, v30, 15
+; VI-NEXT:    v_readlane_b32 s64, v30, 14
+; VI-NEXT:    v_readlane_b32 s55, v30, 13
+; VI-NEXT:    v_readlane_b32 s54, v30, 12
+; VI-NEXT:    v_readlane_b32 s53, v30, 11
+; VI-NEXT:    v_readlane_b32 s52, v30, 10
+; VI-NEXT:    v_readlane_b32 s51, v30, 9
+; VI-NEXT:    v_readlane_b32 s50, v30, 8
+; VI-NEXT:    v_readlane_b32 s49, v30, 7
+; VI-NEXT:    v_readlane_b32 s48, v30, 6
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -31135,7 +31135,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
+define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32059,7 +32059,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15i64_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32067,23 +32067,23 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
+; SI-NEXT:    v_writelane_b32 v30, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s5, v15
 ; SI-NEXT:    v_readfirstlane_b32 s4, v14
 ; SI-NEXT:    v_readfirstlane_b32 s7, v13
@@ -32101,7 +32101,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    s_cmp_lg_u32 s42, 0
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
+; SI-NEXT:    v_writelane_b32 v30, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s34, s5, 16
@@ -32287,6 +32287,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s44, s34, 16
 ; SI-NEXT:    s_or_b32 s7, s7, s45
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_mov_b32_e32 v1, s17
 ; SI-NEXT:    v_mov_b32_e32 v2, s18
@@ -32317,23 +32318,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 16
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32378,14 +32378,14 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
 ; VI-NEXT:    v_readfirstlane_b32 s7, v14
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
@@ -32403,7 +32403,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s44, v1
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_readfirstlane_b32 s45, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s6, 16
@@ -32589,6 +32589,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; VI-NEXT:    s_and_b32 s6, 0xffff, s6
 ; VI-NEXT:    s_lshl_b32 s44, s46, 16
 ; VI-NEXT:    s_or_b32 s6, s6, s44
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -32619,14 +32620,13 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v27, s8
 ; VI-NEXT:    v_mov_b32_e32 v28, s7
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -32671,10 +32671,10 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v30, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v30, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v30, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v30, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v30, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v30, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v13
@@ -32692,7 +32692,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX9-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s45, v0
-; GFX9-NEXT:    v_writelane_b32 v30, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v30, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s6, 16
@@ -32818,6 +32818,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s56
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s47
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s46
+; GFX9-NEXT:    v_readlane_b32 s30, v30, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
@@ -32848,10 +32849,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v27, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v28, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v29, s6
-; GFX9-NEXT:    v_readlane_b32 s35, v30, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v30, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v30, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v30, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v30, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v30, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v30, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -33100,7 +33100,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
+define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v15i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34589,7 +34589,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v15i64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34597,41 +34597,42 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s8, v14
 ; SI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -34648,7 +34649,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s35, v1
 ; SI-NEXT:    v_readfirstlane_b32 s70, v0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s31, s29, 16
 ; SI-NEXT:    s_lshr_b32 s68, s28, 16
 ; SI-NEXT:    s_lshr_b32 s71, s27, 16
@@ -35053,42 +35053,42 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB47_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -35102,53 +35102,54 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
 ; VI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
 ; VI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
 ; VI-NEXT:    v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v12
 ; VI-NEXT:    v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s12, 16
 ; VI-NEXT:    v_readfirstlane_b32 s14, v11
 ; VI-NEXT:    v_writelane_b32 v33, s63, 2
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s14, 16
 ; VI-NEXT:    v_readfirstlane_b32 s72, v10
 ; VI-NEXT:    v_writelane_b32 v33, s62, 3
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s60, s72, 16
 ; VI-NEXT:    v_readfirstlane_b32 s74, v9
 ; VI-NEXT:    v_readfirstlane_b32 s76, v8
@@ -35161,7 +35162,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
 ; VI-NEXT:    v_writelane_b32 v33, s61, 4
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s88, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -35460,38 +35460,38 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB47_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -35840,7 +35840,7 @@ end:
   ret <15 x i64> %phi
 }
 
-define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) {
+define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36673,7 +36673,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36681,23 +36681,23 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s30, 0
-; SI-NEXT:    v_writelane_b32 v62, s31, 1
-; SI-NEXT:    v_writelane_b32 v62, s34, 2
-; SI-NEXT:    v_writelane_b32 v62, s35, 3
-; SI-NEXT:    v_writelane_b32 v62, s36, 4
-; SI-NEXT:    v_writelane_b32 v62, s37, 5
-; SI-NEXT:    v_writelane_b32 v62, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s54, 14
+; SI-NEXT:    v_writelane_b32 v62, s34, 0
+; SI-NEXT:    v_writelane_b32 v62, s35, 1
+; SI-NEXT:    v_writelane_b32 v62, s36, 2
+; SI-NEXT:    v_writelane_b32 v62, s37, 3
+; SI-NEXT:    v_writelane_b32 v62, s38, 4
+; SI-NEXT:    v_writelane_b32 v62, s39, 5
+; SI-NEXT:    v_writelane_b32 v62, s48, 6
+; SI-NEXT:    v_writelane_b32 v62, s49, 7
+; SI-NEXT:    v_writelane_b32 v62, s50, 8
+; SI-NEXT:    v_writelane_b32 v62, s51, 9
+; SI-NEXT:    v_writelane_b32 v62, s52, 10
+; SI-NEXT:    v_writelane_b32 v62, s53, 11
+; SI-NEXT:    v_writelane_b32 v62, s54, 12
+; SI-NEXT:    v_writelane_b32 v62, s55, 13
+; SI-NEXT:    v_writelane_b32 v62, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v62, s55, 15
+; SI-NEXT:    v_writelane_b32 v62, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s15, v15
 ; SI-NEXT:    v_readfirstlane_b32 s14, v14
 ; SI-NEXT:    v_readfirstlane_b32 s41, v13
@@ -36729,7 +36729,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s64, s15, 16
@@ -37012,6 +37012,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v30
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; SI-NEXT:    v_readlane_b32 s30, v62, 15
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v38
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v37
@@ -37022,23 +37023,22 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v32
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v31
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_readlane_b32 s64, v62, 16
-; SI-NEXT:    v_readlane_b32 s55, v62, 15
-; SI-NEXT:    v_readlane_b32 s54, v62, 14
-; SI-NEXT:    v_readlane_b32 s53, v62, 13
-; SI-NEXT:    v_readlane_b32 s52, v62, 12
-; SI-NEXT:    v_readlane_b32 s51, v62, 11
-; SI-NEXT:    v_readlane_b32 s50, v62, 10
-; SI-NEXT:    v_readlane_b32 s49, v62, 9
-; SI-NEXT:    v_readlane_b32 s48, v62, 8
-; SI-NEXT:    v_readlane_b32 s39, v62, 7
-; SI-NEXT:    v_readlane_b32 s38, v62, 6
-; SI-NEXT:    v_readlane_b32 s37, v62, 5
-; SI-NEXT:    v_readlane_b32 s36, v62, 4
-; SI-NEXT:    v_readlane_b32 s35, v62, 3
-; SI-NEXT:    v_readlane_b32 s34, v62, 2
-; SI-NEXT:    v_readlane_b32 s31, v62, 1
-; SI-NEXT:    v_readlane_b32 s30, v62, 0
+; SI-NEXT:    v_readlane_b32 s31, v62, 16
+; SI-NEXT:    v_readlane_b32 s64, v62, 14
+; SI-NEXT:    v_readlane_b32 s55, v62, 13
+; SI-NEXT:    v_readlane_b32 s54, v62, 12
+; SI-NEXT:    v_readlane_b32 s53, v62, 11
+; SI-NEXT:    v_readlane_b32 s52, v62, 10
+; SI-NEXT:    v_readlane_b32 s51, v62, 9
+; SI-NEXT:    v_readlane_b32 s50, v62, 8
+; SI-NEXT:    v_readlane_b32 s49, v62, 7
+; SI-NEXT:    v_readlane_b32 s48, v62, 6
+; SI-NEXT:    v_readlane_b32 s39, v62, 5
+; SI-NEXT:    v_readlane_b32 s38, v62, 4
+; SI-NEXT:    v_readlane_b32 s37, v62, 3
+; SI-NEXT:    v_readlane_b32 s36, v62, 2
+; SI-NEXT:    v_readlane_b32 s35, v62, 1
+; SI-NEXT:    v_readlane_b32 s34, v62, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -37051,14 +37051,14 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s9, v15
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -37088,7 +37088,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s9, 16
@@ -37325,6 +37325,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v35
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v33
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v11, v11, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v15, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -37335,14 +37336,13 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -37355,10 +37355,10 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s11, v13
@@ -37388,7 +37388,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB49_3
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s9, 16
@@ -37617,6 +37617,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v10, v54, 16, v10
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v52, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v50, 16, v14
@@ -37635,10 +37636,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v35, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v30, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v33, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -38083,7 +38083,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
+define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39428,7 +39428,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39436,41 +39436,42 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s7, v15
 ; SI-NEXT:    v_readfirstlane_b32 s9, v14
 ; SI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -39487,7 +39488,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_readfirstlane_b32 s70, v2
 ; SI-NEXT:    v_readfirstlane_b32 s81, v1
 ; SI-NEXT:    v_readfirstlane_b32 s84, v0
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
 ; SI-NEXT:    s_lshr_b32 s90, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
 ; SI-NEXT:    s_lshr_b32 s94, s27, 16
@@ -39765,6 +39765,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; SI-NEXT:  .LBB51_3: ; %end
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_mov_b32_e32 v0, s36
 ; SI-NEXT:    v_mov_b32_e32 v1, s37
 ; SI-NEXT:    v_mov_b32_e32 v2, s38
@@ -39795,42 +39796,41 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v27, s63
 ; SI-NEXT:    v_mov_b32_e32 v28, s64
 ; SI-NEXT:    v_mov_b32_e32 v29, s65
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -39847,47 +39847,48 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
-; VI-NEXT:    v_writelane_b32 v30, s48, 8
-; VI-NEXT:    v_writelane_b32 v30, s49, 9
-; VI-NEXT:    v_writelane_b32 v30, s50, 10
-; VI-NEXT:    v_writelane_b32 v30, s51, 11
-; VI-NEXT:    v_writelane_b32 v30, s52, 12
-; VI-NEXT:    v_writelane_b32 v30, s53, 13
-; VI-NEXT:    v_writelane_b32 v30, s54, 14
-; VI-NEXT:    v_writelane_b32 v30, s55, 15
-; VI-NEXT:    v_writelane_b32 v30, s64, 16
-; VI-NEXT:    v_writelane_b32 v30, s65, 17
-; VI-NEXT:    v_writelane_b32 v30, s66, 18
-; VI-NEXT:    v_writelane_b32 v30, s67, 19
-; VI-NEXT:    v_writelane_b32 v30, s68, 20
-; VI-NEXT:    v_writelane_b32 v30, s69, 21
-; VI-NEXT:    v_writelane_b32 v30, s70, 22
-; VI-NEXT:    v_writelane_b32 v30, s71, 23
-; VI-NEXT:    v_writelane_b32 v30, s80, 24
-; VI-NEXT:    v_writelane_b32 v30, s81, 25
-; VI-NEXT:    v_writelane_b32 v30, s82, 26
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s48, 6
+; VI-NEXT:    v_writelane_b32 v30, s49, 7
+; VI-NEXT:    v_writelane_b32 v30, s50, 8
+; VI-NEXT:    v_writelane_b32 v30, s51, 9
+; VI-NEXT:    v_writelane_b32 v30, s52, 10
+; VI-NEXT:    v_writelane_b32 v30, s53, 11
+; VI-NEXT:    v_writelane_b32 v30, s54, 12
+; VI-NEXT:    v_writelane_b32 v30, s55, 13
+; VI-NEXT:    v_writelane_b32 v30, s64, 14
+; VI-NEXT:    v_writelane_b32 v30, s65, 15
+; VI-NEXT:    v_writelane_b32 v30, s66, 16
+; VI-NEXT:    v_writelane_b32 v30, s67, 17
+; VI-NEXT:    v_writelane_b32 v30, s68, 18
+; VI-NEXT:    v_writelane_b32 v30, s69, 19
+; VI-NEXT:    v_writelane_b32 v30, s70, 20
+; VI-NEXT:    v_writelane_b32 v30, s71, 21
+; VI-NEXT:    v_writelane_b32 v30, s80, 22
+; VI-NEXT:    v_writelane_b32 v30, s81, 23
+; VI-NEXT:    v_writelane_b32 v30, s82, 24
+; VI-NEXT:    v_writelane_b32 v30, s83, 25
+; VI-NEXT:    v_writelane_b32 v30, s84, 26
+; VI-NEXT:    v_writelane_b32 v30, s85, 27
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
-; VI-NEXT:    v_writelane_b32 v30, s83, 27
+; VI-NEXT:    v_writelane_b32 v30, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s11, 16
 ; VI-NEXT:    v_readfirstlane_b32 s13, v12
 ; VI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v30, s84, 28
+; VI-NEXT:    v_writelane_b32 v30, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s13, 16
 ; VI-NEXT:    v_readfirstlane_b32 s15, v11
 ; VI-NEXT:    v_writelane_b32 v31, s63, 0
-; VI-NEXT:    v_writelane_b32 v30, s85, 29
+; VI-NEXT:    v_writelane_b32 v30, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s15, 16
 ; VI-NEXT:    v_readfirstlane_b32 s73, v10
 ; VI-NEXT:    v_writelane_b32 v31, s62, 1
-; VI-NEXT:    v_writelane_b32 v30, s86, 30
+; VI-NEXT:    v_writelane_b32 v30, s31, 31
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    s_lshr_b32 s60, s73, 16
@@ -39902,7 +39903,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
 ; VI-NEXT:    v_writelane_b32 v31, s61, 2
-; VI-NEXT:    v_writelane_b32 v30, s87, 31
 ; VI-NEXT:    s_lshr_b32 s90, s29, 16
 ; VI-NEXT:    s_lshr_b32 s30, s28, 16
 ; VI-NEXT:    s_lshr_b32 s34, s27, 16
@@ -40185,6 +40185,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s65, s4, 0x30000
 ; VI-NEXT:  .LBB51_3: ; %end
+; VI-NEXT:    v_readlane_b32 s30, v30, 30
 ; VI-NEXT:    v_mov_b32_e32 v0, s36
 ; VI-NEXT:    v_mov_b32_e32 v1, s37
 ; VI-NEXT:    v_mov_b32_e32 v2, s38
@@ -40215,38 +40216,37 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v27, s63
 ; VI-NEXT:    v_mov_b32_e32 v28, s64
 ; VI-NEXT:    v_mov_b32_e32 v29, s65
-; VI-NEXT:    v_readlane_b32 s87, v30, 31
-; VI-NEXT:    v_readlane_b32 s86, v30, 30
-; VI-NEXT:    v_readlane_b32 s85, v30, 29
-; VI-NEXT:    v_readlane_b32 s84, v30, 28
-; VI-NEXT:    v_readlane_b32 s83, v30, 27
-; VI-NEXT:    v_readlane_b32 s82, v30, 26
-; VI-NEXT:    v_readlane_b32 s81, v30, 25
-; VI-NEXT:    v_readlane_b32 s80, v30, 24
-; VI-NEXT:    v_readlane_b32 s71, v30, 23
-; VI-NEXT:    v_readlane_b32 s70, v30, 22
-; VI-NEXT:    v_readlane_b32 s69, v30, 21
-; VI-NEXT:    v_readlane_b32 s68, v30, 20
-; VI-NEXT:    v_readlane_b32 s67, v30, 19
-; VI-NEXT:    v_readlane_b32 s66, v30, 18
-; VI-NEXT:    v_readlane_b32 s65, v30, 17
-; VI-NEXT:    v_readlane_b32 s64, v30, 16
-; VI-NEXT:    v_readlane_b32 s55, v30, 15
-; VI-NEXT:    v_readlane_b32 s54, v30, 14
-; VI-NEXT:    v_readlane_b32 s53, v30, 13
-; VI-NEXT:    v_readlane_b32 s52, v30, 12
-; VI-NEXT:    v_readlane_b32 s51, v30, 11
-; VI-NEXT:    v_readlane_b32 s50, v30, 10
-; VI-NEXT:    v_readlane_b32 s49, v30, 9
-; VI-NEXT:    v_readlane_b32 s48, v30, 8
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 31
+; VI-NEXT:    v_readlane_b32 s87, v30, 29
+; VI-NEXT:    v_readlane_b32 s86, v30, 28
+; VI-NEXT:    v_readlane_b32 s85, v30, 27
+; VI-NEXT:    v_readlane_b32 s84, v30, 26
+; VI-NEXT:    v_readlane_b32 s83, v30, 25
+; VI-NEXT:    v_readlane_b32 s82, v30, 24
+; VI-NEXT:    v_readlane_b32 s81, v30, 23
+; VI-NEXT:    v_readlane_b32 s80, v30, 22
+; VI-NEXT:    v_readlane_b32 s71, v30, 21
+; VI-NEXT:    v_readlane_b32 s70, v30, 20
+; VI-NEXT:    v_readlane_b32 s69, v30, 19
+; VI-NEXT:    v_readlane_b32 s68, v30, 18
+; VI-NEXT:    v_readlane_b32 s67, v30, 17
+; VI-NEXT:    v_readlane_b32 s66, v30, 16
+; VI-NEXT:    v_readlane_b32 s65, v30, 15
+; VI-NEXT:    v_readlane_b32 s64, v30, 14
+; VI-NEXT:    v_readlane_b32 s55, v30, 13
+; VI-NEXT:    v_readlane_b32 s54, v30, 12
+; VI-NEXT:    v_readlane_b32 s53, v30, 11
+; VI-NEXT:    v_readlane_b32 s52, v30, 10
+; VI-NEXT:    v_readlane_b32 s51, v30, 9
+; VI-NEXT:    v_readlane_b32 s50, v30, 8
+; VI-NEXT:    v_readlane_b32 s49, v30, 7
+; VI-NEXT:    v_readlane_b32 s48, v30, 6
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -40601,7 +40601,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) {
+define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41434,7 +41434,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v15f64_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41442,23 +41442,23 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s30, 0
-; SI-NEXT:    v_writelane_b32 v62, s31, 1
-; SI-NEXT:    v_writelane_b32 v62, s34, 2
-; SI-NEXT:    v_writelane_b32 v62, s35, 3
-; SI-NEXT:    v_writelane_b32 v62, s36, 4
-; SI-NEXT:    v_writelane_b32 v62, s37, 5
-; SI-NEXT:    v_writelane_b32 v62, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s54, 14
+; SI-NEXT:    v_writelane_b32 v62, s34, 0
+; SI-NEXT:    v_writelane_b32 v62, s35, 1
+; SI-NEXT:    v_writelane_b32 v62, s36, 2
+; SI-NEXT:    v_writelane_b32 v62, s37, 3
+; SI-NEXT:    v_writelane_b32 v62, s38, 4
+; SI-NEXT:    v_writelane_b32 v62, s39, 5
+; SI-NEXT:    v_writelane_b32 v62, s48, 6
+; SI-NEXT:    v_writelane_b32 v62, s49, 7
+; SI-NEXT:    v_writelane_b32 v62, s50, 8
+; SI-NEXT:    v_writelane_b32 v62, s51, 9
+; SI-NEXT:    v_writelane_b32 v62, s52, 10
+; SI-NEXT:    v_writelane_b32 v62, s53, 11
+; SI-NEXT:    v_writelane_b32 v62, s54, 12
+; SI-NEXT:    v_writelane_b32 v62, s55, 13
+; SI-NEXT:    v_writelane_b32 v62, s64, 14
 ; SI-NEXT:    v_readfirstlane_b32 s42, v16
-; SI-NEXT:    v_writelane_b32 v62, s55, 15
+; SI-NEXT:    v_writelane_b32 v62, s30, 15
 ; SI-NEXT:    v_readfirstlane_b32 s15, v15
 ; SI-NEXT:    v_readfirstlane_b32 s14, v14
 ; SI-NEXT:    v_readfirstlane_b32 s41, v13
@@ -41490,7 +41490,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s31, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s64, s15, 16
@@ -41773,6 +41773,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v30
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; SI-NEXT:    v_readlane_b32 s30, v62, 15
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v38
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v37
@@ -41783,23 +41784,22 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v32
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v31
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_readlane_b32 s64, v62, 16
-; SI-NEXT:    v_readlane_b32 s55, v62, 15
-; SI-NEXT:    v_readlane_b32 s54, v62, 14
-; SI-NEXT:    v_readlane_b32 s53, v62, 13
-; SI-NEXT:    v_readlane_b32 s52, v62, 12
-; SI-NEXT:    v_readlane_b32 s51, v62, 11
-; SI-NEXT:    v_readlane_b32 s50, v62, 10
-; SI-NEXT:    v_readlane_b32 s49, v62, 9
-; SI-NEXT:    v_readlane_b32 s48, v62, 8
-; SI-NEXT:    v_readlane_b32 s39, v62, 7
-; SI-NEXT:    v_readlane_b32 s38, v62, 6
-; SI-NEXT:    v_readlane_b32 s37, v62, 5
-; SI-NEXT:    v_readlane_b32 s36, v62, 4
-; SI-NEXT:    v_readlane_b32 s35, v62, 3
-; SI-NEXT:    v_readlane_b32 s34, v62, 2
-; SI-NEXT:    v_readlane_b32 s31, v62, 1
-; SI-NEXT:    v_readlane_b32 s30, v62, 0
+; SI-NEXT:    v_readlane_b32 s31, v62, 16
+; SI-NEXT:    v_readlane_b32 s64, v62, 14
+; SI-NEXT:    v_readlane_b32 s55, v62, 13
+; SI-NEXT:    v_readlane_b32 s54, v62, 12
+; SI-NEXT:    v_readlane_b32 s53, v62, 11
+; SI-NEXT:    v_readlane_b32 s52, v62, 10
+; SI-NEXT:    v_readlane_b32 s51, v62, 9
+; SI-NEXT:    v_readlane_b32 s50, v62, 8
+; SI-NEXT:    v_readlane_b32 s49, v62, 7
+; SI-NEXT:    v_readlane_b32 s48, v62, 6
+; SI-NEXT:    v_readlane_b32 s39, v62, 5
+; SI-NEXT:    v_readlane_b32 s38, v62, 4
+; SI-NEXT:    v_readlane_b32 s37, v62, 3
+; SI-NEXT:    v_readlane_b32 s36, v62, 2
+; SI-NEXT:    v_readlane_b32 s35, v62, 1
+; SI-NEXT:    v_readlane_b32 s34, v62, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -41812,14 +41812,14 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
 ; VI-NEXT:    v_readfirstlane_b32 s4, v16
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
 ; VI-NEXT:    v_readfirstlane_b32 s9, v15
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    v_readfirstlane_b32 s11, v13
@@ -41849,7 +41849,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    s_cbranch_scc0 .LBB53_3
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b32 s46, s9, 16
@@ -42086,6 +42086,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v35
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v33
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v11, v11, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v15, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -42096,14 +42097,13 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -42116,10 +42116,10 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v16
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s11, v13
@@ -42149,7 +42149,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB53_3
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    s_lshr_b32 s46, s9, 16
@@ -42378,6 +42378,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v10, v54, 16, v10
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v52, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v50, 16, v14
@@ -42396,10 +42397,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v35, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v30, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v33, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -42844,7 +42844,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
+define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v15f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44333,7 +44333,7 @@ end:
   ret <15 x double> %phi
 }
 
-define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v15f64_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44341,41 +44341,42 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v32, s30, 0
-; SI-NEXT:    v_writelane_b32 v32, s31, 1
-; SI-NEXT:    v_writelane_b32 v32, s34, 2
-; SI-NEXT:    v_writelane_b32 v32, s35, 3
-; SI-NEXT:    v_writelane_b32 v32, s36, 4
-; SI-NEXT:    v_writelane_b32 v32, s37, 5
-; SI-NEXT:    v_writelane_b32 v32, s38, 6
-; SI-NEXT:    v_writelane_b32 v32, s39, 7
-; SI-NEXT:    v_writelane_b32 v32, s48, 8
-; SI-NEXT:    v_writelane_b32 v32, s49, 9
-; SI-NEXT:    v_writelane_b32 v32, s50, 10
-; SI-NEXT:    v_writelane_b32 v32, s51, 11
-; SI-NEXT:    v_writelane_b32 v32, s52, 12
-; SI-NEXT:    v_writelane_b32 v32, s53, 13
-; SI-NEXT:    v_writelane_b32 v32, s54, 14
-; SI-NEXT:    v_writelane_b32 v32, s55, 15
-; SI-NEXT:    v_writelane_b32 v32, s64, 16
-; SI-NEXT:    v_writelane_b32 v32, s65, 17
-; SI-NEXT:    v_writelane_b32 v32, s66, 18
-; SI-NEXT:    v_writelane_b32 v32, s67, 19
-; SI-NEXT:    v_writelane_b32 v32, s68, 20
-; SI-NEXT:    v_writelane_b32 v32, s69, 21
-; SI-NEXT:    v_writelane_b32 v32, s70, 22
-; SI-NEXT:    v_writelane_b32 v32, s71, 23
-; SI-NEXT:    v_writelane_b32 v32, s80, 24
-; SI-NEXT:    v_writelane_b32 v32, s81, 25
-; SI-NEXT:    v_writelane_b32 v32, s82, 26
-; SI-NEXT:    v_writelane_b32 v32, s83, 27
-; SI-NEXT:    v_writelane_b32 v32, s84, 28
-; SI-NEXT:    v_writelane_b32 v32, s85, 29
-; SI-NEXT:    v_writelane_b32 v32, s86, 30
-; SI-NEXT:    v_writelane_b32 v32, s87, 31
-; SI-NEXT:    v_writelane_b32 v32, s96, 32
-; SI-NEXT:    v_writelane_b32 v32, s97, 33
-; SI-NEXT:    v_writelane_b32 v32, s98, 34
+; SI-NEXT:    v_writelane_b32 v32, s34, 0
+; SI-NEXT:    v_writelane_b32 v32, s35, 1
+; SI-NEXT:    v_writelane_b32 v32, s36, 2
+; SI-NEXT:    v_writelane_b32 v32, s37, 3
+; SI-NEXT:    v_writelane_b32 v32, s38, 4
+; SI-NEXT:    v_writelane_b32 v32, s39, 5
+; SI-NEXT:    v_writelane_b32 v32, s48, 6
+; SI-NEXT:    v_writelane_b32 v32, s49, 7
+; SI-NEXT:    v_writelane_b32 v32, s50, 8
+; SI-NEXT:    v_writelane_b32 v32, s51, 9
+; SI-NEXT:    v_writelane_b32 v32, s52, 10
+; SI-NEXT:    v_writelane_b32 v32, s53, 11
+; SI-NEXT:    v_writelane_b32 v32, s54, 12
+; SI-NEXT:    v_writelane_b32 v32, s55, 13
+; SI-NEXT:    v_writelane_b32 v32, s64, 14
+; SI-NEXT:    v_writelane_b32 v32, s65, 15
+; SI-NEXT:    v_writelane_b32 v32, s66, 16
+; SI-NEXT:    v_writelane_b32 v32, s67, 17
+; SI-NEXT:    v_writelane_b32 v32, s68, 18
+; SI-NEXT:    v_writelane_b32 v32, s69, 19
+; SI-NEXT:    v_writelane_b32 v32, s70, 20
+; SI-NEXT:    v_writelane_b32 v32, s71, 21
+; SI-NEXT:    v_writelane_b32 v32, s80, 22
+; SI-NEXT:    v_writelane_b32 v32, s81, 23
+; SI-NEXT:    v_writelane_b32 v32, s82, 24
+; SI-NEXT:    v_writelane_b32 v32, s83, 25
+; SI-NEXT:    v_writelane_b32 v32, s84, 26
+; SI-NEXT:    v_writelane_b32 v32, s85, 27
+; SI-NEXT:    v_writelane_b32 v32, s86, 28
+; SI-NEXT:    v_writelane_b32 v32, s87, 29
+; SI-NEXT:    v_writelane_b32 v32, s96, 30
+; SI-NEXT:    v_writelane_b32 v32, s97, 31
+; SI-NEXT:    v_writelane_b32 v32, s98, 32
+; SI-NEXT:    v_writelane_b32 v32, s99, 33
+; SI-NEXT:    v_writelane_b32 v32, s30, 34
+; SI-NEXT:    v_writelane_b32 v32, s31, 35
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s8, v14
 ; SI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -44392,7 +44393,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s30, v2
 ; SI-NEXT:    v_readfirstlane_b32 s35, v1
 ; SI-NEXT:    v_readfirstlane_b32 s70, v0
-; SI-NEXT:    v_writelane_b32 v32, s99, 35
 ; SI-NEXT:    s_lshr_b32 s31, s29, 16
 ; SI-NEXT:    s_lshr_b32 s68, s28, 16
 ; SI-NEXT:    s_lshr_b32 s71, s27, 16
@@ -44797,42 +44797,42 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v30, s66
 ; SI-NEXT:    v_mov_b32_e32 v31, s67
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_readlane_b32 s99, v32, 35
-; SI-NEXT:    v_readlane_b32 s98, v32, 34
-; SI-NEXT:    v_readlane_b32 s97, v32, 33
-; SI-NEXT:    v_readlane_b32 s96, v32, 32
-; SI-NEXT:    v_readlane_b32 s87, v32, 31
-; SI-NEXT:    v_readlane_b32 s86, v32, 30
-; SI-NEXT:    v_readlane_b32 s85, v32, 29
-; SI-NEXT:    v_readlane_b32 s84, v32, 28
-; SI-NEXT:    v_readlane_b32 s83, v32, 27
-; SI-NEXT:    v_readlane_b32 s82, v32, 26
-; SI-NEXT:    v_readlane_b32 s81, v32, 25
-; SI-NEXT:    v_readlane_b32 s80, v32, 24
-; SI-NEXT:    v_readlane_b32 s71, v32, 23
-; SI-NEXT:    v_readlane_b32 s70, v32, 22
-; SI-NEXT:    v_readlane_b32 s69, v32, 21
-; SI-NEXT:    v_readlane_b32 s68, v32, 20
-; SI-NEXT:    v_readlane_b32 s67, v32, 19
-; SI-NEXT:    v_readlane_b32 s66, v32, 18
-; SI-NEXT:    v_readlane_b32 s65, v32, 17
-; SI-NEXT:    v_readlane_b32 s64, v32, 16
-; SI-NEXT:    v_readlane_b32 s55, v32, 15
-; SI-NEXT:    v_readlane_b32 s54, v32, 14
-; SI-NEXT:    v_readlane_b32 s53, v32, 13
-; SI-NEXT:    v_readlane_b32 s52, v32, 12
-; SI-NEXT:    v_readlane_b32 s51, v32, 11
-; SI-NEXT:    v_readlane_b32 s50, v32, 10
-; SI-NEXT:    v_readlane_b32 s49, v32, 9
-; SI-NEXT:    v_readlane_b32 s48, v32, 8
-; SI-NEXT:    v_readlane_b32 s39, v32, 7
-; SI-NEXT:    v_readlane_b32 s38, v32, 6
-; SI-NEXT:    v_readlane_b32 s37, v32, 5
-; SI-NEXT:    v_readlane_b32 s36, v32, 4
-; SI-NEXT:    v_readlane_b32 s35, v32, 3
-; SI-NEXT:    v_readlane_b32 s34, v32, 2
-; SI-NEXT:    v_readlane_b32 s31, v32, 1
-; SI-NEXT:    v_readlane_b32 s30, v32, 0
+; SI-NEXT:    v_readlane_b32 s30, v32, 34
+; SI-NEXT:    v_readlane_b32 s31, v32, 35
+; SI-NEXT:    v_readlane_b32 s99, v32, 33
+; SI-NEXT:    v_readlane_b32 s98, v32, 32
+; SI-NEXT:    v_readlane_b32 s97, v32, 31
+; SI-NEXT:    v_readlane_b32 s96, v32, 30
+; SI-NEXT:    v_readlane_b32 s87, v32, 29
+; SI-NEXT:    v_readlane_b32 s86, v32, 28
+; SI-NEXT:    v_readlane_b32 s85, v32, 27
+; SI-NEXT:    v_readlane_b32 s84, v32, 26
+; SI-NEXT:    v_readlane_b32 s83, v32, 25
+; SI-NEXT:    v_readlane_b32 s82, v32, 24
+; SI-NEXT:    v_readlane_b32 s81, v32, 23
+; SI-NEXT:    v_readlane_b32 s80, v32, 22
+; SI-NEXT:    v_readlane_b32 s71, v32, 21
+; SI-NEXT:    v_readlane_b32 s70, v32, 20
+; SI-NEXT:    v_readlane_b32 s69, v32, 19
+; SI-NEXT:    v_readlane_b32 s68, v32, 18
+; SI-NEXT:    v_readlane_b32 s67, v32, 17
+; SI-NEXT:    v_readlane_b32 s66, v32, 16
+; SI-NEXT:    v_readlane_b32 s65, v32, 15
+; SI-NEXT:    v_readlane_b32 s64, v32, 14
+; SI-NEXT:    v_readlane_b32 s55, v32, 13
+; SI-NEXT:    v_readlane_b32 s54, v32, 12
+; SI-NEXT:    v_readlane_b32 s53, v32, 11
+; SI-NEXT:    v_readlane_b32 s52, v32, 10
+; SI-NEXT:    v_readlane_b32 s51, v32, 9
+; SI-NEXT:    v_readlane_b32 s50, v32, 8
+; SI-NEXT:    v_readlane_b32 s49, v32, 7
+; SI-NEXT:    v_readlane_b32 s48, v32, 6
+; SI-NEXT:    v_readlane_b32 s39, v32, 5
+; SI-NEXT:    v_readlane_b32 s38, v32, 4
+; SI-NEXT:    v_readlane_b32 s37, v32, 3
+; SI-NEXT:    v_readlane_b32 s36, v32, 2
+; SI-NEXT:    v_readlane_b32 s35, v32, 1
+; SI-NEXT:    v_readlane_b32 s34, v32, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -44846,53 +44846,54 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v32, s30, 0
-; VI-NEXT:    v_writelane_b32 v32, s31, 1
-; VI-NEXT:    v_writelane_b32 v32, s34, 2
-; VI-NEXT:    v_writelane_b32 v32, s35, 3
-; VI-NEXT:    v_writelane_b32 v32, s36, 4
-; VI-NEXT:    v_writelane_b32 v32, s37, 5
-; VI-NEXT:    v_writelane_b32 v32, s38, 6
-; VI-NEXT:    v_writelane_b32 v32, s39, 7
-; VI-NEXT:    v_writelane_b32 v32, s48, 8
-; VI-NEXT:    v_writelane_b32 v32, s49, 9
-; VI-NEXT:    v_writelane_b32 v32, s50, 10
-; VI-NEXT:    v_writelane_b32 v32, s51, 11
-; VI-NEXT:    v_writelane_b32 v32, s52, 12
-; VI-NEXT:    v_writelane_b32 v32, s53, 13
-; VI-NEXT:    v_writelane_b32 v32, s54, 14
-; VI-NEXT:    v_writelane_b32 v32, s55, 15
-; VI-NEXT:    v_writelane_b32 v32, s64, 16
-; VI-NEXT:    v_writelane_b32 v32, s65, 17
-; VI-NEXT:    v_writelane_b32 v32, s66, 18
-; VI-NEXT:    v_writelane_b32 v32, s67, 19
-; VI-NEXT:    v_writelane_b32 v32, s68, 20
-; VI-NEXT:    v_writelane_b32 v32, s69, 21
-; VI-NEXT:    v_writelane_b32 v32, s70, 22
-; VI-NEXT:    v_writelane_b32 v32, s71, 23
-; VI-NEXT:    v_writelane_b32 v32, s80, 24
+; VI-NEXT:    v_writelane_b32 v32, s34, 0
+; VI-NEXT:    v_writelane_b32 v32, s35, 1
+; VI-NEXT:    v_writelane_b32 v32, s36, 2
+; VI-NEXT:    v_writelane_b32 v32, s37, 3
+; VI-NEXT:    v_writelane_b32 v32, s38, 4
+; VI-NEXT:    v_writelane_b32 v32, s39, 5
+; VI-NEXT:    v_writelane_b32 v32, s48, 6
+; VI-NEXT:    v_writelane_b32 v32, s49, 7
+; VI-NEXT:    v_writelane_b32 v32, s50, 8
+; VI-NEXT:    v_writelane_b32 v32, s51, 9
+; VI-NEXT:    v_writelane_b32 v32, s52, 10
+; VI-NEXT:    v_writelane_b32 v32, s53, 11
+; VI-NEXT:    v_writelane_b32 v32, s54, 12
+; VI-NEXT:    v_writelane_b32 v32, s55, 13
+; VI-NEXT:    v_writelane_b32 v32, s64, 14
+; VI-NEXT:    v_writelane_b32 v32, s65, 15
+; VI-NEXT:    v_writelane_b32 v32, s66, 16
+; VI-NEXT:    v_writelane_b32 v32, s67, 17
+; VI-NEXT:    v_writelane_b32 v32, s68, 18
+; VI-NEXT:    v_writelane_b32 v32, s69, 19
+; VI-NEXT:    v_writelane_b32 v32, s70, 20
+; VI-NEXT:    v_writelane_b32 v32, s71, 21
+; VI-NEXT:    v_writelane_b32 v32, s80, 22
+; VI-NEXT:    v_writelane_b32 v32, s81, 23
+; VI-NEXT:    v_writelane_b32 v32, s82, 24
+; VI-NEXT:    v_writelane_b32 v32, s83, 25
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
-; VI-NEXT:    v_writelane_b32 v32, s81, 25
+; VI-NEXT:    v_writelane_b32 v32, s84, 26
 ; VI-NEXT:    s_lshr_b32 vcc_lo, s6, 16
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; VI-NEXT:    v_writelane_b32 v32, s82, 26
+; VI-NEXT:    v_writelane_b32 v32, s85, 27
 ; VI-NEXT:    s_lshr_b32 vcc_hi, s8, 16
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
 ; VI-NEXT:    v_writelane_b32 v33, vcc_lo, 0
-; VI-NEXT:    v_writelane_b32 v32, s83, 27
+; VI-NEXT:    v_writelane_b32 v32, s86, 28
 ; VI-NEXT:    s_lshr_b32 s63, s10, 16
 ; VI-NEXT:    v_readfirstlane_b32 s12, v12
 ; VI-NEXT:    v_writelane_b32 v33, vcc_hi, 1
-; VI-NEXT:    v_writelane_b32 v32, s84, 28
+; VI-NEXT:    v_writelane_b32 v32, s87, 29
 ; VI-NEXT:    s_lshr_b32 s62, s12, 16
 ; VI-NEXT:    v_readfirstlane_b32 s14, v11
 ; VI-NEXT:    v_writelane_b32 v33, s63, 2
-; VI-NEXT:    v_writelane_b32 v32, s85, 29
+; VI-NEXT:    v_writelane_b32 v32, s30, 30
 ; VI-NEXT:    s_lshr_b32 s61, s14, 16
 ; VI-NEXT:    v_readfirstlane_b32 s72, v10
 ; VI-NEXT:    v_writelane_b32 v33, s62, 3
-; VI-NEXT:    v_writelane_b32 v32, s86, 30
+; VI-NEXT:    v_writelane_b32 v32, s31, 31
 ; VI-NEXT:    s_lshr_b32 s60, s72, 16
 ; VI-NEXT:    v_readfirstlane_b32 s74, v9
 ; VI-NEXT:    v_readfirstlane_b32 s76, v8
@@ -44905,7 +44906,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; VI-NEXT:    v_readfirstlane_b32 s85, v1
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
 ; VI-NEXT:    v_writelane_b32 v33, s61, 4
-; VI-NEXT:    v_writelane_b32 v32, s87, 31
 ; VI-NEXT:    s_lshr_b32 s56, s29, 16
 ; VI-NEXT:    s_lshr_b32 s88, s28, 16
 ; VI-NEXT:    s_lshr_b32 s31, s27, 16
@@ -45204,38 +45204,38 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v30, s66
 ; VI-NEXT:    v_mov_b32_e32 v31, s67
 ; VI-NEXT:  .LBB55_5: ; %end
-; VI-NEXT:    v_readlane_b32 s87, v32, 31
-; VI-NEXT:    v_readlane_b32 s86, v32, 30
-; VI-NEXT:    v_readlane_b32 s85, v32, 29
-; VI-NEXT:    v_readlane_b32 s84, v32, 28
-; VI-NEXT:    v_readlane_b32 s83, v32, 27
-; VI-NEXT:    v_readlane_b32 s82, v32, 26
-; VI-NEXT:    v_readlane_b32 s81, v32, 25
-; VI-NEXT:    v_readlane_b32 s80, v32, 24
-; VI-NEXT:    v_readlane_b32 s71, v32, 23
-; VI-NEXT:    v_readlane_b32 s70, v32, 22
-; VI-NEXT:    v_readlane_b32 s69, v32, 21
-; VI-NEXT:    v_readlane_b32 s68, v32, 20
-; VI-NEXT:    v_readlane_b32 s67, v32, 19
-; VI-NEXT:    v_readlane_b32 s66, v32, 18
-; VI-NEXT:    v_readlane_b32 s65, v32, 17
-; VI-NEXT:    v_readlane_b32 s64, v32, 16
-; VI-NEXT:    v_readlane_b32 s55, v32, 15
-; VI-NEXT:    v_readlane_b32 s54, v32, 14
-; VI-NEXT:    v_readlane_b32 s53, v32, 13
-; VI-NEXT:    v_readlane_b32 s52, v32, 12
-; VI-NEXT:    v_readlane_b32 s51, v32, 11
-; VI-NEXT:    v_readlane_b32 s50, v32, 10
-; VI-NEXT:    v_readlane_b32 s49, v32, 9
-; VI-NEXT:    v_readlane_b32 s48, v32, 8
-; VI-NEXT:    v_readlane_b32 s39, v32, 7
-; VI-NEXT:    v_readlane_b32 s38, v32, 6
-; VI-NEXT:    v_readlane_b32 s37, v32, 5
-; VI-NEXT:    v_readlane_b32 s36, v32, 4
-; VI-NEXT:    v_readlane_b32 s35, v32, 3
-; VI-NEXT:    v_readlane_b32 s34, v32, 2
-; VI-NEXT:    v_readlane_b32 s31, v32, 1
-; VI-NEXT:    v_readlane_b32 s30, v32, 0
+; VI-NEXT:    v_readlane_b32 s30, v32, 30
+; VI-NEXT:    v_readlane_b32 s31, v32, 31
+; VI-NEXT:    v_readlane_b32 s87, v32, 29
+; VI-NEXT:    v_readlane_b32 s86, v32, 28
+; VI-NEXT:    v_readlane_b32 s85, v32, 27
+; VI-NEXT:    v_readlane_b32 s84, v32, 26
+; VI-NEXT:    v_readlane_b32 s83, v32, 25
+; VI-NEXT:    v_readlane_b32 s82, v32, 24
+; VI-NEXT:    v_readlane_b32 s81, v32, 23
+; VI-NEXT:    v_readlane_b32 s80, v32, 22
+; VI-NEXT:    v_readlane_b32 s71, v32, 21
+; VI-NEXT:    v_readlane_b32 s70, v32, 20
+; VI-NEXT:    v_readlane_b32 s69, v32, 19
+; VI-NEXT:    v_readlane_b32 s68, v32, 18
+; VI-NEXT:    v_readlane_b32 s67, v32, 17
+; VI-NEXT:    v_readlane_b32 s66, v32, 16
+; VI-NEXT:    v_readlane_b32 s65, v32, 15
+; VI-NEXT:    v_readlane_b32 s64, v32, 14
+; VI-NEXT:    v_readlane_b32 s55, v32, 13
+; VI-NEXT:    v_readlane_b32 s54, v32, 12
+; VI-NEXT:    v_readlane_b32 s53, v32, 11
+; VI-NEXT:    v_readlane_b32 s52, v32, 10
+; VI-NEXT:    v_readlane_b32 s51, v32, 9
+; VI-NEXT:    v_readlane_b32 s50, v32, 8
+; VI-NEXT:    v_readlane_b32 s49, v32, 7
+; VI-NEXT:    v_readlane_b32 s48, v32, 6
+; VI-NEXT:    v_readlane_b32 s39, v32, 5
+; VI-NEXT:    v_readlane_b32 s38, v32, 4
+; VI-NEXT:    v_readlane_b32 s37, v32, 3
+; VI-NEXT:    v_readlane_b32 s36, v32, 2
+; VI-NEXT:    v_readlane_b32 s35, v32, 1
+; VI-NEXT:    v_readlane_b32 s34, v32, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -45584,7 +45584,7 @@ end:
   ret <15 x double> %phi
 }
 
-define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) {
+define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v60f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47057,7 +47057,7 @@ end:
   ret <60 x half> %phi
 }
 
-define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60i16_to_v60f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47066,54 +47066,53 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v30, s30, 0
-; SI-NEXT:    v_writelane_b32 v30, s31, 1
-; SI-NEXT:    v_writelane_b32 v30, s34, 2
-; SI-NEXT:    v_writelane_b32 v30, s35, 3
-; SI-NEXT:    v_writelane_b32 v30, s36, 4
-; SI-NEXT:    v_writelane_b32 v30, s37, 5
-; SI-NEXT:    v_writelane_b32 v30, s38, 6
-; SI-NEXT:    v_writelane_b32 v30, s39, 7
-; SI-NEXT:    v_writelane_b32 v30, s48, 8
-; SI-NEXT:    v_writelane_b32 v30, s49, 9
-; SI-NEXT:    v_writelane_b32 v30, s50, 10
-; SI-NEXT:    v_writelane_b32 v30, s51, 11
-; SI-NEXT:    v_writelane_b32 v30, s52, 12
-; SI-NEXT:    v_writelane_b32 v30, s53, 13
-; SI-NEXT:    v_writelane_b32 v30, s54, 14
-; SI-NEXT:    v_writelane_b32 v30, s55, 15
-; SI-NEXT:    v_writelane_b32 v30, s64, 16
-; SI-NEXT:    v_writelane_b32 v30, s65, 17
-; SI-NEXT:    v_writelane_b32 v30, s66, 18
-; SI-NEXT:    v_writelane_b32 v30, s67, 19
-; SI-NEXT:    v_writelane_b32 v30, s68, 20
-; SI-NEXT:    v_writelane_b32 v30, s69, 21
-; SI-NEXT:    v_writelane_b32 v30, s70, 22
-; SI-NEXT:    v_writelane_b32 v30, s71, 23
-; SI-NEXT:    v_writelane_b32 v30, s80, 24
-; SI-NEXT:    v_writelane_b32 v30, s81, 25
-; SI-NEXT:    v_writelane_b32 v30, s82, 26
-; SI-NEXT:    v_writelane_b32 v30, s83, 27
-; SI-NEXT:    v_writelane_b32 v30, s84, 28
+; SI-NEXT:    v_writelane_b32 v30, s34, 0
+; SI-NEXT:    v_writelane_b32 v30, s35, 1
+; SI-NEXT:    v_writelane_b32 v30, s36, 2
+; SI-NEXT:    v_writelane_b32 v30, s37, 3
+; SI-NEXT:    v_writelane_b32 v30, s38, 4
+; SI-NEXT:    v_writelane_b32 v30, s39, 5
+; SI-NEXT:    v_writelane_b32 v30, s48, 6
+; SI-NEXT:    v_writelane_b32 v30, s49, 7
+; SI-NEXT:    v_writelane_b32 v30, s50, 8
+; SI-NEXT:    v_writelane_b32 v30, s51, 9
+; SI-NEXT:    v_writelane_b32 v30, s52, 10
+; SI-NEXT:    v_writelane_b32 v30, s53, 11
+; SI-NEXT:    v_writelane_b32 v30, s54, 12
+; SI-NEXT:    v_writelane_b32 v30, s55, 13
+; SI-NEXT:    v_writelane_b32 v30, s64, 14
+; SI-NEXT:    v_writelane_b32 v30, s65, 15
+; SI-NEXT:    v_writelane_b32 v30, s66, 16
+; SI-NEXT:    v_writelane_b32 v30, s67, 17
+; SI-NEXT:    v_writelane_b32 v30, s68, 18
+; SI-NEXT:    v_writelane_b32 v30, s69, 19
+; SI-NEXT:    v_writelane_b32 v30, s70, 20
+; SI-NEXT:    v_writelane_b32 v30, s71, 21
+; SI-NEXT:    v_writelane_b32 v30, s80, 22
+; SI-NEXT:    v_writelane_b32 v30, s81, 23
+; SI-NEXT:    v_writelane_b32 v30, s82, 24
+; SI-NEXT:    v_writelane_b32 v30, s83, 25
+; SI-NEXT:    v_writelane_b32 v30, s84, 26
+; SI-NEXT:    v_writelane_b32 v30, s85, 27
 ; SI-NEXT:    s_lshr_b32 s4, s19, 16
 ; SI-NEXT:    ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v30, s85, 29
+; SI-NEXT:    v_writelane_b32 v30, s86, 28
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v31, s4, 0
 ; SI-NEXT:    s_lshr_b32 s4, s18, 16
-; SI-NEXT:    v_writelane_b32 v30, s86, 30
+; SI-NEXT:    v_writelane_b32 v30, s87, 29
 ; SI-NEXT:    v_writelane_b32 v31, s4, 1
-; SI-NEXT:    v_writelane_b32 v30, s87, 31
+; SI-NEXT:    v_writelane_b32 v30, s96, 30
 ; SI-NEXT:    v_writelane_b32 v31, s17, 2
 ; SI-NEXT:    s_lshr_b32 s4, s17, 16
-; SI-NEXT:    v_writelane_b32 v30, s96, 32
+; SI-NEXT:    v_writelane_b32 v30, s97, 31
 ; SI-NEXT:    v_writelane_b32 v31, s4, 3
-; SI-NEXT:    v_writelane_b32 v30, s97, 33
+; SI-NEXT:    v_writelane_b32 v30, s98, 32
 ; SI-NEXT:    v_writelane_b32 v31, s16, 4
 ; SI-NEXT:    s_lshr_b32 s4, s16, 16
-; SI-NEXT:    v_writelane_b32 v30, s98, 34
+; SI-NEXT:    v_writelane_b32 v30, s99, 33
 ; SI-NEXT:    v_writelane_b32 v31, s4, 5
-; SI-NEXT:    v_writelane_b32 v30, s99, 35
+; SI-NEXT:    v_writelane_b32 v30, s30, 34
 ; SI-NEXT:    v_readfirstlane_b32 s75, v15
 ; SI-NEXT:    v_readfirstlane_b32 s70, v14
 ; SI-NEXT:    v_readfirstlane_b32 s17, v13
@@ -47131,6 +47130,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s97, v1
 ; SI-NEXT:    v_readfirstlane_b32 s99, v0
 ; SI-NEXT:    v_writelane_b32 v31, s18, 6
+; SI-NEXT:    v_writelane_b32 v30, s31, 35
 ; SI-NEXT:    s_mov_b32 s85, s21
 ; SI-NEXT:    s_lshr_b32 s79, s29, 16
 ; SI-NEXT:    s_lshr_b32 s92, s28, 16
@@ -47692,6 +47692,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_lshl_b32 s44, s86, 16
 ; SI-NEXT:    s_or_b32 s5, s5, s44
+; SI-NEXT:    v_readlane_b32 s30, v30, 34
 ; SI-NEXT:    v_readlane_b32 s47, v31, 15
 ; SI-NEXT:    v_readlane_b32 s45, v31, 25
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
@@ -47724,42 +47725,41 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v27, s7
 ; SI-NEXT:    v_mov_b32_e32 v28, s4
 ; SI-NEXT:    v_mov_b32_e32 v29, s5
-; SI-NEXT:    v_readlane_b32 s99, v30, 35
-; SI-NEXT:    v_readlane_b32 s98, v30, 34
-; SI-NEXT:    v_readlane_b32 s97, v30, 33
-; SI-NEXT:    v_readlane_b32 s96, v30, 32
-; SI-NEXT:    v_readlane_b32 s87, v30, 31
-; SI-NEXT:    v_readlane_b32 s86, v30, 30
-; SI-NEXT:    v_readlane_b32 s85, v30, 29
-; SI-NEXT:    v_readlane_b32 s84, v30, 28
-; SI-NEXT:    v_readlane_b32 s83, v30, 27
-; SI-NEXT:    v_readlane_b32 s82, v30, 26
-; SI-NEXT:    v_readlane_b32 s81, v30, 25
-; SI-NEXT:    v_readlane_b32 s80, v30, 24
-; SI-NEXT:    v_readlane_b32 s71, v30, 23
-; SI-NEXT:    v_readlane_b32 s70, v30, 22
-; SI-NEXT:    v_readlane_b32 s69, v30, 21
-; SI-NEXT:    v_readlane_b32 s68, v30, 20
-; SI-NEXT:    v_readlane_b32 s67, v30, 19
-; SI-NEXT:    v_readlane_b32 s66, v30, 18
-; SI-NEXT:    v_readlane_b32 s65, v30, 17
-; SI-NEXT:    v_readlane_b32 s64, v30, 16
-; SI-NEXT:    v_readlane_b32 s55, v30, 15
-; SI-NEXT:    v_readlane_b32 s54, v30, 14
-; SI-NEXT:    v_readlane_b32 s53, v30, 13
-; SI-NEXT:    v_readlane_b32 s52, v30, 12
-; SI-NEXT:    v_readlane_b32 s51, v30, 11
-; SI-NEXT:    v_readlane_b32 s50, v30, 10
-; SI-NEXT:    v_readlane_b32 s49, v30, 9
-; SI-NEXT:    v_readlane_b32 s48, v30, 8
-; SI-NEXT:    v_readlane_b32 s39, v30, 7
-; SI-NEXT:    v_readlane_b32 s38, v30, 6
-; SI-NEXT:    v_readlane_b32 s37, v30, 5
-; SI-NEXT:    v_readlane_b32 s36, v30, 4
-; SI-NEXT:    v_readlane_b32 s35, v30, 3
-; SI-NEXT:    v_readlane_b32 s34, v30, 2
-; SI-NEXT:    v_readlane_b32 s31, v30, 1
-; SI-NEXT:    v_readlane_b32 s30, v30, 0
+; SI-NEXT:    v_readlane_b32 s31, v30, 35
+; SI-NEXT:    v_readlane_b32 s99, v30, 33
+; SI-NEXT:    v_readlane_b32 s98, v30, 32
+; SI-NEXT:    v_readlane_b32 s97, v30, 31
+; SI-NEXT:    v_readlane_b32 s96, v30, 30
+; SI-NEXT:    v_readlane_b32 s87, v30, 29
+; SI-NEXT:    v_readlane_b32 s86, v30, 28
+; SI-NEXT:    v_readlane_b32 s85, v30, 27
+; SI-NEXT:    v_readlane_b32 s84, v30, 26
+; SI-NEXT:    v_readlane_b32 s83, v30, 25
+; SI-NEXT:    v_readlane_b32 s82, v30, 24
+; SI-NEXT:    v_readlane_b32 s81, v30, 23
+; SI-NEXT:    v_readlane_b32 s80, v30, 22
+; SI-NEXT:    v_readlane_b32 s71, v30, 21
+; SI-NEXT:    v_readlane_b32 s70, v30, 20
+; SI-NEXT:    v_readlane_b32 s69, v30, 19
+; SI-NEXT:    v_readlane_b32 s68, v30, 18
+; SI-NEXT:    v_readlane_b32 s67, v30, 17
+; SI-NEXT:    v_readlane_b32 s66, v30, 16
+; SI-NEXT:    v_readlane_b32 s65, v30, 15
+; SI-NEXT:    v_readlane_b32 s64, v30, 14
+; SI-NEXT:    v_readlane_b32 s55, v30, 13
+; SI-NEXT:    v_readlane_b32 s54, v30, 12
+; SI-NEXT:    v_readlane_b32 s53, v30, 11
+; SI-NEXT:    v_readlane_b32 s52, v30, 10
+; SI-NEXT:    v_readlane_b32 s51, v30, 9
+; SI-NEXT:    v_readlane_b32 s50, v30, 8
+; SI-NEXT:    v_readlane_b32 s49, v30, 7
+; SI-NEXT:    v_readlane_b32 s48, v30, 6
+; SI-NEXT:    v_readlane_b32 s39, v30, 5
+; SI-NEXT:    v_readlane_b32 s38, v30, 4
+; SI-NEXT:    v_readlane_b32 s37, v30, 3
+; SI-NEXT:    v_readlane_b32 s36, v30, 2
+; SI-NEXT:    v_readlane_b32 s35, v30, 1
+; SI-NEXT:    v_readlane_b32 s34, v30, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -47821,13 +47821,14 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v30, s30, 0
-; VI-NEXT:    v_writelane_b32 v30, s31, 1
-; VI-NEXT:    v_writelane_b32 v30, s34, 2
-; VI-NEXT:    v_writelane_b32 v30, s35, 3
-; VI-NEXT:    v_writelane_b32 v30, s36, 4
-; VI-NEXT:    v_writelane_b32 v30, s37, 5
-; VI-NEXT:    v_writelane_b32 v30, s38, 6
+; VI-NEXT:    v_writelane_b32 v30, s34, 0
+; VI-NEXT:    v_writelane_b32 v30, s35, 1
+; VI-NEXT:    v_writelane_b32 v30, s36, 2
+; VI-NEXT:    v_writelane_b32 v30, s37, 3
+; VI-NEXT:    v_writelane_b32 v30, s38, 4
+; VI-NEXT:    v_writelane_b32 v30, s39, 5
+; VI-NEXT:    v_writelane_b32 v30, s30, 6
+; VI-NEXT:    v_writelane_b32 v30, s31, 7
 ; VI-NEXT:    v_readfirstlane_b32 s7, v15
 ; VI-NEXT:    v_readfirstlane_b32 s8, v14
 ; VI-NEXT:    v_readfirstlane_b32 s10, v13
@@ -47844,7 +47845,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s89, v2
 ; VI-NEXT:    v_readfirstlane_b32 s30, v1
 ; VI-NEXT:    v_readfirstlane_b32 s35, v0
-; VI-NEXT:    v_writelane_b32 v30, s39, 7
 ; VI-NEXT:    s_lshr_b32 s44, s29, 16
 ; VI-NEXT:    s_lshr_b32 s47, s28, 16
 ; VI-NEXT:    s_lshr_b32 s58, s27, 16
@@ -48032,6 +48032,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; VI-NEXT:    s_or_b32 s10, s10, s11
 ; VI-NEXT:    s_or_b32 s8, s8, s9
 ; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    v_readlane_b32 s30, v30, 6
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -48062,14 +48063,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v27, s10
 ; VI-NEXT:    v_mov_b32_e32 v28, s8
 ; VI-NEXT:    v_mov_b32_e32 v29, s6
-; VI-NEXT:    v_readlane_b32 s39, v30, 7
-; VI-NEXT:    v_readlane_b32 s38, v30, 6
-; VI-NEXT:    v_readlane_b32 s37, v30, 5
-; VI-NEXT:    v_readlane_b32 s36, v30, 4
-; VI-NEXT:    v_readlane_b32 s35, v30, 3
-; VI-NEXT:    v_readlane_b32 s34, v30, 2
-; VI-NEXT:    v_readlane_b32 s31, v30, 1
-; VI-NEXT:    v_readlane_b32 s30, v30, 0
+; VI-NEXT:    v_readlane_b32 s31, v30, 7
+; VI-NEXT:    v_readlane_b32 s39, v30, 5
+; VI-NEXT:    v_readlane_b32 s38, v30, 4
+; VI-NEXT:    v_readlane_b32 s37, v30, 3
+; VI-NEXT:    v_readlane_b32 s36, v30, 2
+; VI-NEXT:    v_readlane_b32 s35, v30, 1
+; VI-NEXT:    v_readlane_b32 s34, v30, 0
 ; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -48084,10 +48084,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    v_readfirstlane_b32 s35, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s34, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s31, v13
@@ -48361,6 +48361,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v55, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v54, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v53, 16, v14
@@ -48379,10 +48380,9 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v32, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v31, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v30, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -48857,7 +48857,7 @@ end:
   ret <60 x half> %phi
 }
 
-define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
+define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v60i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49884,7 +49884,7 @@ end:
   ret <60 x i16> %phi
 }
 
-define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v60f16_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49892,9 +49892,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s34, 0
+; SI-NEXT:    v_writelane_b32 v63, s35, 1
+; SI-NEXT:    v_writelane_b32 v63, s30, 2
+; SI-NEXT:    v_writelane_b32 v63, s31, 3
 ; SI-NEXT:    v_readfirstlane_b32 s6, v15
 ; SI-NEXT:    v_readfirstlane_b32 s7, v14
 ; SI-NEXT:    v_readfirstlane_b32 s47, v13
@@ -49911,7 +49912,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s94, v2
 ; SI-NEXT:    v_readfirstlane_b32 s40, v1
 ; SI-NEXT:    v_readfirstlane_b32 s30, v0
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
 ; SI-NEXT:    s_lshr_b32 s15, s29, 16
 ; SI-NEXT:    s_lshr_b32 s79, s28, 16
 ; SI-NEXT:    s_lshr_b32 s14, s27, 16
@@ -50420,10 +50420,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s30, v63, 2
+; SI-NEXT:    v_readlane_b32 s31, v63, 3
+; SI-NEXT:    v_readlane_b32 s35, v63, 1
+; SI-NEXT:    v_readlane_b32 s34, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v26
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
@@ -50466,13 +50466,14 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v60, s30, 0
-; VI-NEXT:    v_writelane_b32 v60, s31, 1
-; VI-NEXT:    v_writelane_b32 v60, s34, 2
-; VI-NEXT:    v_writelane_b32 v60, s35, 3
-; VI-NEXT:    v_writelane_b32 v60, s36, 4
-; VI-NEXT:    v_writelane_b32 v60, s37, 5
-; VI-NEXT:    v_writelane_b32 v60, s38, 6
+; VI-NEXT:    v_writelane_b32 v60, s34, 0
+; VI-NEXT:    v_writelane_b32 v60, s35, 1
+; VI-NEXT:    v_writelane_b32 v60, s36, 2
+; VI-NEXT:    v_writelane_b32 v60, s37, 3
+; VI-NEXT:    v_writelane_b32 v60, s38, 4
+; VI-NEXT:    v_writelane_b32 v60, s39, 5
+; VI-NEXT:    v_writelane_b32 v60, s30, 6
+; VI-NEXT:    v_writelane_b32 v60, s31, 7
 ; VI-NEXT:    v_readfirstlane_b32 s44, v15
 ; VI-NEXT:    v_readfirstlane_b32 s46, v14
 ; VI-NEXT:    v_readfirstlane_b32 s56, v13
@@ -50489,7 +50490,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_readfirstlane_b32 s34, v2
 ; VI-NEXT:    v_readfirstlane_b32 s36, v1
 ; VI-NEXT:    v_readfirstlane_b32 s38, v0
-; VI-NEXT:    v_writelane_b32 v60, s39, 7
 ; VI-NEXT:    s_lshr_b32 s6, s29, 16
 ; VI-NEXT:    s_lshr_b32 s7, s28, 16
 ; VI-NEXT:    s_lshr_b32 s8, s27, 16
@@ -50718,6 +50718,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; VI-NEXT:    v_readlane_b32 s30, v60, 6
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -50736,14 +50737,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_readlane_b32 s39, v60, 7
-; VI-NEXT:    v_readlane_b32 s38, v60, 6
-; VI-NEXT:    v_readlane_b32 s37, v60, 5
-; VI-NEXT:    v_readlane_b32 s36, v60, 4
-; VI-NEXT:    v_readlane_b32 s35, v60, 3
-; VI-NEXT:    v_readlane_b32 s34, v60, 2
-; VI-NEXT:    v_readlane_b32 s31, v60, 1
-; VI-NEXT:    v_readlane_b32 s30, v60, 0
+; VI-NEXT:    v_readlane_b32 s31, v60, 7
+; VI-NEXT:    v_readlane_b32 s39, v60, 5
+; VI-NEXT:    v_readlane_b32 s38, v60, 4
+; VI-NEXT:    v_readlane_b32 s37, v60, 3
+; VI-NEXT:    v_readlane_b32 s36, v60, 2
+; VI-NEXT:    v_readlane_b32 s35, v60, 1
+; VI-NEXT:    v_readlane_b32 s34, v60, 0
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
@@ -50756,10 +50756,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v60, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v60, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v60, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v60, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v60, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v60, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v60, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v60, s31, 3
 ; GFX9-NEXT:    v_readfirstlane_b32 s35, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s34, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s31, v13
@@ -51034,6 +51034,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff, v28
 ; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX9-NEXT:    v_readlane_b32 s30, v60, 2
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v55, 16, v12
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v54, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v53, 16, v14
@@ -51052,10 +51053,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v32, 16, v27
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v31, 16, v28
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v30, 16, v29
-; GFX9-NEXT:    v_readlane_b32 s35, v60, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v60, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v60, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v60, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v60, 0
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -51529,3 +51529,5 @@ end:
   %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <60 x i16> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 23a43ff3b2f87..a1c0db086bf3c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
-define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) {
+define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -84,7 +84,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v3i32_to_v3f32_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v3i32_to_v3f32_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -179,7 +179,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <3 x i32> @bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) {
+define <3 x i32> @bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -256,7 +256,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v3f32_to_v3i32_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v3f32_to_v3i32_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -354,7 +354,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) {
+define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -625,7 +625,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -858,7 +858,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
+define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1284,7 +1284,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1571,7 +1571,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1684,7 +1684,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1806,7 +1806,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
+define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2153,7 +2153,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2535,7 +2535,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) {
+define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2633,7 +2633,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2746,7 +2746,7 @@ end:
   ret <6 x half> %phi
 }
 
-define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) {
+define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2884,7 +2884,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3030,7 +3030,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) {
+define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3128,7 +3128,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3i32_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3241,7 +3241,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) {
+define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v3i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3366,7 +3366,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v3i32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3499,7 +3499,7 @@ end:
   ret <3 x i32> %phi
 }
 
-define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) {
+define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3768,7 +3768,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4021,7 +4021,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
+define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4447,7 +4447,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4734,7 +4734,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4846,7 +4846,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4980,7 +4980,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
+define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5327,7 +5327,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5709,7 +5709,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) {
+define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5806,7 +5806,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5928,7 +5928,7 @@ end:
   ret <6 x half> %phi
 }
 
-define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) {
+define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6066,7 +6066,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6212,7 +6212,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) {
+define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6309,7 +6309,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v3f32_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6431,7 +6431,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) {
+define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6556,7 +6556,7 @@ end:
   ret <3 x float> %phi
 }
 
-define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v3f32_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6689,7 +6689,7 @@ end:
   ret <3 x float> %phi
 }
 
-define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7134,7 +7134,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7438,7 +7438,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
+define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7961,7 +7961,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8515,7 +8515,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
+define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8958,7 +8958,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9265,7 +9265,7 @@ end:
   ret <6 x half> %phi
 }
 
-define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) {
+define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9582,7 +9582,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9878,7 +9878,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
+define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10321,7 +10321,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v12i8_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10628,7 +10628,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) {
+define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v12i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10938,7 +10938,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11215,7 +11215,7 @@ end:
   ret <12 x i8> %phi
 }
 
-define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) {
+define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11577,7 +11577,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11975,7 +11975,7 @@ end:
   ret <6 x half> %phi
 }
 
-define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12119,7 +12119,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12285,7 +12285,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
+define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12640,7 +12640,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6bf16_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13017,7 +13017,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v6bf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13148,7 +13148,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v6bf16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13299,7 +13299,7 @@ end:
   ret <6 x bfloat> %phi
 }
 
-define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) {
+define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v6i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13421,7 +13421,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6f16_to_v6i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13573,7 +13573,7 @@ end:
   ret <6 x i16> %phi
 }
 
-define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) {
+define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v6f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13706,7 +13706,7 @@ end:
   ret <6 x half> %phi
 }
 
-define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
 ; SI-LABEL: bitcast_v6i16_to_v6f16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13855,3 +13855,5 @@ end:
   %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret <6 x half> %phi
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
index 2b48cf0f41c88..7e9f825e298c7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
 
-define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: bitcast_i8ptr_v16i8ptr:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -66,3 +66,5 @@ entry:
   store <16 x i8> %0, ptr addrspace(1) %out
   ret void
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index afe0971088bc1..a4ed384842956 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -17,7 +17,7 @@ declare hidden half @_Z4pownDhi(half, i32)
 ; test pow
 ; --------------------------------------------------------------------
 
-define half @test_pow_fast_f16(half %x, half %y) {
+define half @test_pow_fast_f16(half %x, half %y) #0 {
 ; CHECK-LABEL: test_pow_fast_f16:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29,7 +29,7 @@ define half @test_pow_fast_f16(half %x, half %y) {
   ret half %pow
 }
 
-define float @test_pow_fast_f32(float %x, float %y) {
+define float @test_pow_fast_f32(float %x, float %y) #0 {
 ; CHECK-LABEL: test_pow_fast_f32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43,7 +43,7 @@ define float @test_pow_fast_f32(float %x, float %y) {
   ret float %pow
 }
 
-define double @test_pow_fast_f64(double %x, double %y) {
+define double @test_pow_fast_f64(double %x, double %y) #0 {
 ; CHECK-LABEL: test_pow_fast_f64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -55,7 +55,7 @@ define double @test_pow_fast_f64(double %x, double %y) {
   ret double %pow
 }
 
-define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
+define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) #0 {
 ; CHECK-LABEL: test_pow_fast_f16__integral_y:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -76,7 +76,7 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
   ret half %pow
 }
 
-define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
+define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) #0 {
 ; CHECK-LABEL: test_pow_fast_f32__integral_y:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -109,7 +109,7 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
   ret float %pow
 }
 
-define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
+define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) #0 {
 ; CHECK-LABEL: test_pow_fast_f64__integral_y:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -119,32 +119,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_mov_b32 s50, s15
@@ -178,21 +178,21 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
 ; CHECK-NEXT:    v_or_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -210,7 +210,7 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; test powr
 ; --------------------------------------------------------------------
 
-define half @test_powr_fast_f16(half %x, half %y) {
+define half @test_powr_fast_f16(half %x, half %y) #0 {
 ; CHECK-LABEL: test_powr_fast_f16:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -222,7 +222,7 @@ define half @test_powr_fast_f16(half %x, half %y) {
   ret half %powr
 }
 
-define float @test_powr_fast_f32(float %x, float %y) {
+define float @test_powr_fast_f32(float %x, float %y) #0 {
 ; CHECK-LABEL: test_powr_fast_f32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -249,7 +249,7 @@ define float @test_powr_fast_f32(float %x, float %y) {
   ret float %powr
 }
 
-define double @test_powr_fast_f64(double %x, double %y) {
+define double @test_powr_fast_f64(double %x, double %y) #0 {
 ; CHECK-LABEL: test_powr_fast_f64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -259,30 +259,30 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v2
@@ -314,20 +314,20 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -344,7 +344,7 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; test pown
 ; --------------------------------------------------------------------
 
-define half @test_pown_fast_f16(half %x, i32 %y) {
+define half @test_pown_fast_f16(half %x, i32 %y) #0 {
 ; CHECK-LABEL: test_pown_fast_f16:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -361,7 +361,7 @@ define half @test_pown_fast_f16(half %x, i32 %y) {
   ret half %call
 }
 
-define float @test_pown_fast_f32(float %x, i32 %y) {
+define float @test_pown_fast_f32(float %x, i32 %y) #0 {
 ; CHECK-LABEL: test_pown_fast_f32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -391,7 +391,7 @@ define float @test_pown_fast_f32(float %x, i32 %y) {
   ret float %call
 }
 
-define double @test_pown_fast_f64(double %x, i32 %y) {
+define double @test_pown_fast_f64(double %x, i32 %y) #0 {
 ; CHECK-LABEL: test_pown_fast_f64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -401,32 +401,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_mov_b32 s50, s15
@@ -460,21 +460,21 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 12
 ; CHECK-NEXT:    v_or_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -487,7 +487,7 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
   ret double %call
 }
 
-define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
+define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) #0 {
 ; CHECK-LABEL: test_pown_fast_f16_known_even:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -503,7 +503,7 @@ define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
   ret half %call
 }
 
-define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
+define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) #0 {
 ; CHECK-LABEL: test_pown_fast_f32_known_even:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -533,7 +533,7 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
   ret float %call
 }
 
-define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
+define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) #0 {
 ; CHECK-LABEL: test_pown_fast_f64_known_even:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -543,30 +543,30 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v42, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v42, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v42, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v42, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v42, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v42, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v42, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v42, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v42, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v42, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v42, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v42, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v42, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v42, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v42, s39, 5
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v42, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v42, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v42, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v42, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v42, s50, 8
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v42, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v42, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v42, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v42, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v42, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v42, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v42, s30, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v42, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v42, s31, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -597,20 +597,20 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s53, v42, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v42, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v42, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v42, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v42, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v42, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v42, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v42, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v42, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v42, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v42, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v42, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v42, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v42, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v42, 12
+; CHECK-NEXT:    v_readlane_b32 s31, v42, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v42, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v42, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v42, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v42, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v42, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v42, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v42, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v42, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v42, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v42, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v42, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v42, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v42, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -624,7 +624,7 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
   ret double %call
 }
 
-define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
+define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) #0 {
 ; CHECK-LABEL: test_pown_fast_f16_known_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -642,7 +642,7 @@ define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
   ret half %call
 }
 
-define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
+define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) #0 {
 ; CHECK-LABEL: test_pown_fast_f32_known_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -674,7 +674,7 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
   ret float %call
 }
 
-define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
+define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) #0 {
 ; CHECK-LABEL: test_pown_fast_f64_known_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -684,33 +684,33 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v43, s16, 15
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s54, 12
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 13
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v41
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT:    v_writelane_b32 v43, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 14
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -744,21 +744,21 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s54, v43, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 15
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -771,3 +771,5 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
index 554f40b5bfdfa..4253ed138a8cb 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -214,8 +214,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -242,8 +242,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX8-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX8-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX8-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX8-ARCH-FLAT-NEXT:    s_add_i32 s3, s33, 8
@@ -270,8 +270,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -297,8 +297,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX9-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX9-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX9-ARCH-FLAT-NEXT:    scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -321,11 +321,12 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX942-ARCH-FLAT-NEXT:    s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
 ; GFX942-ARCH-FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX942-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX942-ARCH-FLAT-NEXT:    s_nop 1
 ; GFX942-ARCH-FLAT-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX942-ARCH-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-ARCH-FLAT-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX942-ARCH-FLAT-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX942-ARCH-FLAT-NEXT:    s_mov_b32 s32, s33
 ; GFX942-ARCH-FLAT-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-ARCH-FLAT-NEXT:    scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -352,8 +353,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -867,4 +868,4 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %
   ret void
 }
 
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
+attributes #0 = { "amdgpu-no-flat-scratch-init" nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index a066211e56b13..b8c87555ea9bc 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -10,7 +10,7 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16
 
-define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -92,7 +92,7 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
   ret void
 }
 
-define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
+define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_load_global_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -154,7 +154,7 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
   ret <2 x bfloat> %load
 }
 
-define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
+define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_load_global_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -218,7 +218,7 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
   ret <3 x bfloat> %load
 }
 
-define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
+define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_load_global_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -280,7 +280,7 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
   ret <4 x bfloat> %load
 }
 
-define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
+define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_load_global_v6bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -342,7 +342,7 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
   ret <6 x bfloat> %load
 }
 
-define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
+define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_load_global_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -404,7 +404,7 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
   ret <8 x bfloat> %load
 }
 
-define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
+define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_load_global_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -503,7 +503,7 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
   ret <16 x bfloat> %load
 }
 
-define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
+define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_load_global_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -622,7 +622,7 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
   ret <32 x bfloat> %load
 }
 
-define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
+define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_load_global_v64bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -785,7 +785,7 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
   ret <64 x bfloat> %load
 }
 
-define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
+define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_store_global_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -854,7 +854,7 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
   ret void
 }
 
-define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
+define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_store_global_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -924,7 +924,7 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
   ret void
 }
 
-define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
+define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_store_global_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -983,7 +983,7 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
   ret void
 }
 
-define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
+define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_store_global_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1042,7 +1042,7 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
   ret void
 }
 
-define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
+define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_store_global_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1113,7 +1113,7 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
   ret void
 }
 
-define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
+define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_store_global_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1202,7 +1202,7 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
   ret void
 }
 
-define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
+define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: v_store_global_v64bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1381,7 +1381,7 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
   ret void
 }
 
-define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
+define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) #0 {
 ; GCN-LABEL: test_store_fpimm:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1481,7 +1481,7 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
   ret void
 }
 
-define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store_f32_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1596,7 +1596,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
   ret void
 }
 
-define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store_f64_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1777,7 +1777,7 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
   ret void
 }
 
-define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store_bf16_to_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1860,7 +1860,7 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
   ret void
 }
 
-define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store_bf16_to_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1952,7 +1952,7 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
   ret void
 }
 
-define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2026,7 +2026,7 @@ define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
   ret void
 }
 
-define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2100,7 +2100,7 @@ define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
   ret void
 }
 
-define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2174,7 +2174,7 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
   ret void
 }
 
-define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_load_store_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2276,7 +2276,7 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out
   ret void
 }
 
-define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
+define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_arg_store:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2345,7 +2345,7 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
   ret void
 }
 
-define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
+define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_arg_store_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2414,7 +2414,7 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
   ret void
 }
 
-define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
+define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_arg_store_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2484,7 +2484,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
   ret void
 }
 
-define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
+define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_arg_store_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2543,7 +2543,7 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
   ret void
 }
 
-define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
+define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_arg_store_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2602,7 +2602,7 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
   ret void
 }
 
-define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
+define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_arg_store_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2673,7 +2673,7 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
   ret void
 }
 
-define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
+define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_inreg_arg_store:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2754,7 +2754,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
   ret void
 }
 
-define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
+define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) #0 {
 ; GCN-LABEL: test_byval:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2828,7 +2828,7 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
   ret bfloat %retval
 }
 
-define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
+define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) #0 {
 ; GCN-LABEL: test_sret:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2886,7 +2886,7 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
   ret void
 }
 
-define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: test_bitcast_from_bfloat:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2969,7 +2969,7 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
   ret void
 }
 
-define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-LABEL: test_bitcast_to_bfloat:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3052,7 +3052,7 @@ define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in)
   ret void
 }
 
-define bfloat @test_ret(bfloat %in) {
+define bfloat @test_ret(bfloat %in) #0 {
 ; GCN-LABEL: test_ret:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3092,7 +3092,7 @@ entry:
   ret bfloat %in
 }
 
-define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
+define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) #0 {
 ; GCN-LABEL: test_ret_v2bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3132,7 +3132,7 @@ entry:
   ret <2 x bfloat> %in
 }
 
-define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
+define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) #0 {
 ; GCN-LABEL: test_ret_v3bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3174,7 +3174,7 @@ entry:
   ret <3 x bfloat> %in
 }
 
-define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
+define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) #0 {
 ; GCN-LABEL: test_ret_v4bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3214,7 +3214,7 @@ entry:
   ret <4 x bfloat> %in
 }
 
-define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
+define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) #0 {
 ; GCN-LABEL: test_ret_v8bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3254,7 +3254,7 @@ entry:
   ret <8 x bfloat> %in
 }
 
-define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
+define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) #0 {
 ; GCN-LABEL: test_ret_v16bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3294,7 +3294,7 @@ entry:
   ret <16 x bfloat> %in
 }
 
-define void @test_call(bfloat %in, ptr addrspace(5) %out) {
+define void @test_call(bfloat %in, ptr addrspace(5) %out) #0 {
 ; GCN-LABEL: test_call:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3315,8 +3315,8 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v2, 0
+; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3342,10 +3342,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3371,10 +3371,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3400,10 +3400,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3426,13 +3426,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    scratch_store_short v1, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -3459,10 +3460,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3489,10 +3490,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    scratch_store_b16 v1, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -3519,10 +3521,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    scratch_store_b16 v1, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -3538,7 +3541,7 @@ entry:
   ret void
 }
 
-define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
+define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) #0 {
 ; GCN-LABEL: test_call_v2bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3559,8 +3562,8 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v2, 0
+; GCN-NEXT:    v_readlane_b32 s31, v2, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3586,10 +3589,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3615,10 +3618,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3644,10 +3647,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3670,13 +3673,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    scratch_store_dword v1, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -3703,10 +3707,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3733,10 +3737,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    scratch_store_b32 v1, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -3763,10 +3768,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    scratch_store_b32 v1, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -3782,7 +3788,7 @@ entry:
   ret void
 }
 
-define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
+define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) #0 {
 ; GCN-LABEL: test_call_v3bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3807,8 +3813,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v4, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v4, 0
+; GCN-NEXT:    v_readlane_b32 s31, v4, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3836,12 +3842,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v2
+; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3868,12 +3874,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3899,12 +3905,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3927,16 +3933,17 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    v_mov_b32_e32 v4, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_short v4, v1, off offset:4 sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    scratch_store_dword v4, v0, off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -3963,12 +3970,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3995,12 +4002,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_store_b32 v2, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -4028,13 +4036,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 v4, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -4050,7 +4059,7 @@ entry:
   ret void
 }
 
-define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
+define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) #0 {
 ; GCN-LABEL: test_call_v4bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4082,8 +4091,8 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v3, v7, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v8, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v8, 0
+; GCN-NEXT:    v_readlane_b32 s31, v8, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4120,10 +4129,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 2, v2
+; GFX7-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX7-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v6, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4150,12 +4159,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4181,12 +4190,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4209,14 +4218,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    v_mov_b32_e32 v4, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -4243,12 +4253,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4275,10 +4285,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -4306,10 +4317,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -4325,7 +4337,7 @@ entry:
   ret void
 }
 
-define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
+define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) #0 {
 ; GCN-LABEL: test_call_v8bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4371,8 +4383,8 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v5, v15, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v16, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v16, 0
+; GCN-NEXT:    v_readlane_b32 s31, v16, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4423,10 +4435,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 2, v4
+; GFX7-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX7-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4459,12 +4471,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v6, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4490,6 +4502,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX900-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -4499,7 +4512,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4522,13 +4534,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    scratch_store_dwordx4 v4, v[0:3], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -4555,6 +4568,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -4564,7 +4578,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4591,10 +4604,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX11-NEXT:    scratch_store_b128 v4, v[0:3], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
@@ -4621,10 +4635,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -4640,7 +4655,7 @@ entry:
   ret void
 }
 
-define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
+define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) #0 {
 ; GCN-LABEL: test_call_v16bf16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4714,8 +4729,8 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v9, v6, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v20, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v20, 0
+; GCN-NEXT:    v_readlane_b32 s31, v20, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4794,10 +4809,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 2, v8
+; GFX7-NEXT:    v_readlane_b32 s30, v18, 0
 ; GFX7-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readlane_b32 s31, v18, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v18, 0
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4842,12 +4857,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v8
+; GFX8-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX8-NEXT:    s_mov_b32 s32, s33
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4873,6 +4888,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX900-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -4890,7 +4906,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX900-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4913,15 +4928,16 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX950-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX950-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX950-NEXT:    scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    scratch_store_dwordx4 v8, v[0:3], off sc0 sc1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX950-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX950-NEXT:    s_mov_b32 s32, s33
 ; GFX950-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX950-NEXT:    scratch_load_dword v9, off, s33 ; 4-byte Folded Reload
@@ -4948,6 +4964,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX10-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -4965,7 +4982,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4992,12 +5008,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX11-NEXT:    scratch_store_b128 v8, v[4:7], off offset:16 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_store_b128 v8, v[0:3], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
@@ -5024,13 +5041,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX1250-NEXT:    scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX1250-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX1250-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -5046,7 +5064,7 @@ entry:
   ret void
 }
 
-define bfloat @test_alloca_load_store_ret(bfloat %in) {
+define bfloat @test_alloca_load_store_ret(bfloat %in) #0 {
 ; GCN-LABEL: test_alloca_load_store_ret:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5135,7 +5153,7 @@ entry:
   ret bfloat %loaded
 }
 
-define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
+define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) #0 {
 ; GCN-LABEL: test_overflow_stack:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5586,7 +5604,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
   ret { <32 x i32>, bfloat } %ins.1
 }
 
-define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
+define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v2bf16_to_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5664,7 +5682,7 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
   ret <2 x float> %fpext
 }
 
-define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
+define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v3bf16_to_v3f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5759,7 +5777,7 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
   ret <3 x float> %fpext
 }
 
-define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
+define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v4bf16_to_v4f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5851,7 +5869,7 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
   ret <4 x float> %fpext
 }
 
-define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
+define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v5bf16_to_v5f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5954,7 +5972,7 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
   ret <5 x float> %fpext
 }
 
-define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
+define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v6bf16_to_v6f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6072,7 +6090,7 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
   ret <6 x float> %fpext
 }
 
-define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
+define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v8bf16_to_v8f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6191,7 +6209,7 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
   ret <8 x float> %fpext
 }
 
-define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
+define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v16bf16_to_v16f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6384,7 +6402,7 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
   ret <16 x float> %fpext
 }
 
-define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
+define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v32bf16_to_v32f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6719,7 +6737,7 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
   ret <32 x float> %fpext
 }
 
-define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
+define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v2bf16_to_v2f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6824,7 +6842,7 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
   ret <2 x double> %fpext
 }
 
-define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
+define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v3bf16_to_v3f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6946,7 +6964,7 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
   ret <3 x double> %fpext
 }
 
-define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
+define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v4bf16_to_v4f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7069,7 +7087,7 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
   ret <4 x double> %fpext
 }
 
-define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
+define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v5bf16_to_v5f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7207,7 +7225,7 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
   ret <5 x double> %fpext
 }
 
-define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
+define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v6bf16_to_v6f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7355,7 +7373,7 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
   ret <6 x double> %fpext
 }
 
-define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
+define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v8bf16_to_v8f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7529,7 +7547,7 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
   ret <8 x double> %fpext
 }
 
-define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
+define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v16bf16_to_v16f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7832,7 +7850,7 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
   ret <16 x double> %fpext
 }
 
-define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
+define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_extload_v32bf16_to_v32f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9474,7 +9492,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
   ret <32 x double> %fpext
 }
 
-define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
+define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fadd_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9592,7 +9610,7 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fadd_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9764,7 +9782,7 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fadd_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9994,7 +10012,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fadd_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10276,7 +10294,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
   ret <4 x bfloat> %op
 }
 
-define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fadd_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10792,7 +10810,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
   ret <8 x bfloat> %op
 }
 
-define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fadd_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11768,7 +11786,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
   ret <16 x bfloat> %op
 }
 
-define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fadd_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13628,7 +13646,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
   ret <32 x bfloat> %op
 }
 
-define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
+define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) #0 {
 ; GCN-LABEL: v_fadd_bf16_fpimm_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13739,7 +13757,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
   ret bfloat %add
 }
 
-define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
+define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) #0 {
 ; GCN-LABEL: v_fadd_bf16_fpimm_1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13850,7 +13868,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
   ret bfloat %add
 }
 
-define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
+define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fsub_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13968,7 +13986,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fsub_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14140,7 +14158,7 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fsub_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14370,7 +14388,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fsub_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14652,7 +14670,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
   ret <4 x bfloat> %op
 }
 
-define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
+define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fmul_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14768,7 +14786,7 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fmul_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14940,7 +14958,7 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fmul_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15170,7 +15188,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fmul_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15452,7 +15470,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
   ret <4 x bfloat> %op
 }
 
-define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fmul_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15968,7 +15986,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
   ret <8 x bfloat> %op
 }
 
-define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fmul_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16944,7 +16962,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
   ret <16 x bfloat> %op
 }
 
-define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) #0 {
 ; GCN-LABEL: v_fmul_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18804,7 +18822,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
   ret <32 x bfloat> %op
 }
 
-define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
+define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fdiv_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19031,7 +19049,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 
 declare bfloat @llvm.fabs.bf16(bfloat)
 
-define bfloat @v_fabs_bf16(bfloat %a) {
+define bfloat @v_fabs_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_fabs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19091,7 +19109,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
   ret bfloat %op
 }
 
-define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
+define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) #0 {
 ; GCN-LABEL: s_fabs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
@@ -19141,7 +19159,7 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
   ret i32 %readlane
 }
 
-define bfloat @v_fneg_bf16(bfloat %a) {
+define bfloat @v_fneg_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_fneg_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19204,7 +19222,7 @@ define bfloat @v_fneg_bf16(bfloat %a) {
 declare i32 @llvm.amdgcn.readfirstlane(i32)
 
 ; FIXME: readfirstlane hack for other bugs
-define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
+define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) #0 {
 ; GCN-LABEL: s_fneg_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_xor_b32 s0, s0, 0x8000
@@ -19256,7 +19274,7 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
   ret i32 %readlane
 }
 
-define bfloat @v_fneg_fabs_bf16(bfloat %a) {
+define bfloat @v_fneg_fabs_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_fneg_fabs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19318,7 +19336,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
 }
 
 ; FIXME: readfirstlane hack for other bugs
-define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
+define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) #0 {
 ; GCN-LABEL: s_fneg_fabs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_bitset1_b32 s0, 15
@@ -19379,7 +19397,7 @@ declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
 declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
 declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
 
-define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
+define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_minnum_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19502,7 +19520,7 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; GCN-LABEL: v_minnum_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19682,7 +19700,7 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) #0 {
 ; GCN-LABEL: v_minnum_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19924,7 +19942,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
 ; GCN-LABEL: v_minnum_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20222,7 +20240,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
   ret <4 x bfloat> %op
 }
 
-define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; GCN-LABEL: v_minnum_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20770,7 +20788,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
   ret <8 x bfloat> %op
 }
 
-define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) #0 {
 ; GCN-LABEL: v_minnum_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21810,7 +21828,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
   ret <16 x bfloat> %op
 }
 
-define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) #0 {
 ; GCN-LABEL: v_minnum_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23807,7 +23825,7 @@ declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
 declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
 declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
 
-define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
+define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_maxnum_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23930,7 +23948,7 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; GCN-LABEL: v_maxnum_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24110,7 +24128,7 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) #0 {
 ; GCN-LABEL: v_maxnum_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24352,7 +24370,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
 ; GCN-LABEL: v_maxnum_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24650,7 +24668,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
   ret <4 x bfloat> %op
 }
 
-define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; GCN-LABEL: v_maxnum_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25198,7 +25216,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
   ret <8 x bfloat> %op
 }
 
-define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) #0 {
 ; GCN-LABEL: v_maxnum_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26238,7 +26256,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
   ret <16 x bfloat> %op
 }
 
-define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) #0 {
 ; GCN-LABEL: v_maxnum_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28228,7 +28246,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 
 declare bfloat @llvm.sqrt.bf16(bfloat)
 
-define bfloat @v_sqrt_bf16(bfloat %a) {
+define bfloat @v_sqrt_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_sqrt_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28490,7 +28508,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
   ret bfloat %op
 }
 
-define bfloat @v_rsq_bf16(bfloat %x) {
+define bfloat @v_rsq_bf16(bfloat %x) #0 {
 ; GCN-LABEL: v_rsq_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28894,7 +28912,7 @@ define bfloat @v_rsq_bf16(bfloat %x) {
   ret bfloat %rsq
 }
 
-define bfloat @v_neg_rsq_bf16(bfloat %x) {
+define bfloat @v_neg_rsq_bf16(bfloat %x) #0 {
 ; GCN-LABEL: v_neg_rsq_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29306,7 +29324,7 @@ define bfloat @v_neg_rsq_bf16(bfloat %x) {
 
 declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
 
-define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
+define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) #0 {
 ; GCN-LABEL: v_ldexp_bf16_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29419,7 +29437,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 
 declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
 
-define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
+define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) #0 {
 ; GCN-LABEL: v_frexp_bf16_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29549,7 +29567,7 @@ declare bfloat @llvm.log.bf16(bfloat)
 declare bfloat @llvm.log2.bf16(bfloat)
 declare bfloat @llvm.log10.bf16(bfloat)
 
-define bfloat @v_log_bf16(bfloat %a) {
+define bfloat @v_log_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_log_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29805,7 +29823,7 @@ define bfloat @v_log_bf16(bfloat %a) {
   ret bfloat %op
 }
 
-define bfloat @v_log2_bf16(bfloat %a) {
+define bfloat @v_log2_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_log2_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29975,7 +29993,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
   ret bfloat %op
 }
 
-define bfloat @v_log10_bf16(bfloat %a) {
+define bfloat @v_log10_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_log10_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30235,7 +30253,7 @@ declare bfloat @llvm.exp.bf16(bfloat)
 declare bfloat @llvm.exp2.bf16(bfloat)
 declare bfloat @llvm.exp10.bf16(bfloat)
 
-define bfloat @v_exp_bf16(bfloat %a) {
+define bfloat @v_exp_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_exp_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30531,7 +30549,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
   ret bfloat %op
 }
 
-define bfloat @v_exp2_bf16(bfloat %a) {
+define bfloat @v_exp2_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_exp2_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30705,7 +30723,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
   ret bfloat %op
 }
 
-define bfloat @v_exp10_bf16(bfloat %a) {
+define bfloat @v_exp10_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_exp10_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31003,7 +31021,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 
 declare bfloat @llvm.ceil.bf16(bfloat)
 
-define bfloat @v_ceil_bf16(bfloat %a) {
+define bfloat @v_ceil_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_ceil_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31116,7 +31134,7 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 
 declare bfloat @llvm.trunc.bf16(bfloat)
 
-define bfloat @v_trunc_bf16(bfloat %a) {
+define bfloat @v_trunc_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_trunc_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31229,7 +31247,7 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 
 declare bfloat @llvm.rint.bf16(bfloat)
 
-define bfloat @v_rint_bf16(bfloat %a) {
+define bfloat @v_rint_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_rint_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31342,7 +31360,7 @@ define bfloat @v_rint_bf16(bfloat %a) {
 
 declare bfloat @llvm.nearbyint.bf16(bfloat)
 
-define bfloat @v_nearbyint_bf16(bfloat %a) {
+define bfloat @v_nearbyint_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_nearbyint_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31455,7 +31473,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 
 declare bfloat @llvm.round.bf16(bfloat)
 
-define bfloat @v_round_bf16(bfloat %a) {
+define bfloat @v_round_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_round_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31626,7 +31644,7 @@ define bfloat @v_round_bf16(bfloat %a) {
 
 declare bfloat @llvm.roundeven.bf16(bfloat)
 
-define bfloat @v_roundeven_bf16(bfloat %a) {
+define bfloat @v_roundeven_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_roundeven_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31739,7 +31757,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 
 declare bfloat @llvm.floor.bf16(bfloat)
 
-define bfloat @v_floor_bf16(bfloat %a) {
+define bfloat @v_floor_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_floor_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31852,7 +31870,7 @@ define bfloat @v_floor_bf16(bfloat %a) {
 
 declare bfloat @llvm.canonicalize.bf16(bfloat)
 
-define bfloat @v_canonicalize_bf16(bfloat %a) {
+define bfloat @v_canonicalize_bf16(bfloat %a) #0 {
 ; GCN-LABEL: v_canonicalize_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31966,12 +31984,12 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
 
 ; FIXME: Promotion broken
-; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
+; define bfloat @v_arithmetic_fence_bf16(bfloat %a) #0 {
 ;   %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
 ;   ret bfloat %op
 ; }
 
-define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_false_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32018,7 +32036,7 @@ define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_oeq_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32097,7 +32115,7 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_ogt_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32176,7 +32194,7 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_oge_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32255,7 +32273,7 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_olt_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32334,7 +32352,7 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_ole_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32413,7 +32431,7 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_one_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32492,7 +32510,7 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_uno_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32571,7 +32589,7 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_ueq_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32650,7 +32668,7 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_ugt_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32729,7 +32747,7 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_uge_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32808,7 +32826,7 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_ult_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32887,7 +32905,7 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_ule_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32966,7 +32984,7 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_une_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33045,7 +33063,7 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
+define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_fcmp_true_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33092,7 +33110,7 @@ define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
   ret i1 %op
 }
 
-define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
+define i16 @v_fptosi_bf16_to_i16(bfloat %x) #0 {
 ; GCN-LABEL: v_fptosi_bf16_to_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33148,7 +33166,7 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
   ret i16 %op
 }
 
-define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
+define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33237,7 +33255,7 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
   ret <2 x i16> %op
 }
 
-define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
+define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33343,7 +33361,7 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
   ret <3 x i16> %op
 }
 
-define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
+define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33471,7 +33489,7 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
   ret <4 x i16> %op
 }
 
-define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
+define i32 @v_fptosi_bf16_to_i32(bfloat %x) #0 {
 ; GCN-LABEL: v_fptosi_bf16_to_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33527,7 +33545,7 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
   ret i32 %op
 }
 
-define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
+define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33600,7 +33618,7 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
   ret <2 x i32> %op
 }
 
-define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
+define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33691,7 +33709,7 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
   ret <3 x i32> %op
 }
 
-define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
+define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33796,7 +33814,7 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
   ret <4 x i32> %op
 }
 
-define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
+define i64 @v_fptosi_bf16_to_i64(bfloat %x) #0 {
 ; GCN-LABEL: v_fptosi_bf16_to_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33956,7 +33974,7 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
   ret i64 %op
 }
 
-define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
+define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34219,7 +34237,7 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
   ret <2 x i64> %op
 }
 
-define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
+define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34580,7 +34598,7 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
   ret <3 x i64> %op
 }
 
-define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
+define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) #0 {
 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35033,7 +35051,7 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
   ret <4 x i64> %op
 }
 
-define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
+define bfloat @v_sitofp_i16_to_bf16(i16 %x) #0 {
 ; GCN-LABEL: v_sitofp_i16_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35140,7 +35158,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
+define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) #0 {
 ; GCN-LABEL: v_sitofp_v2i16_to_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35294,7 +35312,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
+define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) #0 {
 ; GCN-LABEL: v_sitofp_v3i16_to_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35517,7 +35535,7 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
+define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) #0 {
 ; GCN-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35765,7 +35783,7 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
   ret <4 x bfloat> %op
 }
 
-define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
+define bfloat @v_sitofp_i32_to_bf16(i32 %x) #0 {
 ; GCN-LABEL: v_sitofp_i32_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35865,7 +35883,7 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
+define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) #0 {
 ; GCN-LABEL: v_sitofp_v2i32_to_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36006,7 +36024,7 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
+define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) #0 {
 ; GCN-LABEL: v_sitofp_v3i32_to_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36205,7 +36223,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
+define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) #0 {
 ; GCN-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36427,7 +36445,7 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
   ret <4 x bfloat> %op
 }
 
-define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
+define bfloat @v_sitofp_i64_to_bf16(i64 %x) #0 {
 ; GCN-LABEL: v_sitofp_i64_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36638,7 +36656,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
+define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) #0 {
 ; GCN-LABEL: v_sitofp_v2i64_to_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36999,7 +37017,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
+define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) #0 {
 ; GCN-LABEL: v_sitofp_v3i64_to_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37554,7 +37572,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
+define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) #0 {
 ; GCN-LABEL: v_sitofp_v4i64_to_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38195,7 +38213,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
   ret <4 x bfloat> %op
 }
 
-define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
+define bfloat @v_uitofp_i16_to_bf16(i16 %x) #0 {
 ; GCN-LABEL: v_uitofp_i16_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38314,7 +38332,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
+define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) #0 {
 ; GCN-LABEL: v_uitofp_v2i16_to_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38483,7 +38501,7 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
+define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) #0 {
 ; GCN-LABEL: v_uitofp_v3i16_to_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38712,7 +38730,7 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
+define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) #0 {
 ; GCN-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38986,7 +39004,7 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
   ret <4 x bfloat> %op
 }
 
-define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
+define bfloat @v_uitofp_i32_to_bf16(i32 %x) #0 {
 ; GCN-LABEL: v_uitofp_i32_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39086,7 +39104,7 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
+define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) #0 {
 ; GCN-LABEL: v_uitofp_v2i32_to_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39227,7 +39245,7 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
+define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) #0 {
 ; GCN-LABEL: v_uitofp_v3i32_to_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39426,7 +39444,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
+define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) #0 {
 ; GCN-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39648,7 +39666,7 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
   ret <4 x bfloat> %op
 }
 
-define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
+define bfloat @v_uitofp_i64_to_bf16(i64 %x) #0 {
 ; GCN-LABEL: v_uitofp_i64_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39819,7 +39837,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
+define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) #0 {
 ; GCN-LABEL: v_uitofp_v2i64_to_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40103,7 +40121,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
+define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) #0 {
 ; GCN-LABEL: v_uitofp_v3i64_to_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40540,7 +40558,7 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
+define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) #0 {
 ; GCN-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41037,7 +41055,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
   ret <4 x bfloat> %op
 }
 
-define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
+define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_select_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41128,7 +41146,7 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
   ret bfloat %op
 }
 
-define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
+define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_select_fneg_lhs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41230,7 +41248,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
   ret bfloat %op
 }
 
-define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
+define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) #0 {
 ; GCN-LABEL: v_select_fneg_rhs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41332,7 +41350,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
+define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; GCN-LABEL: v_select_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41454,7 +41472,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %op
 }
 
-define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
+define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; GCN-LABEL: v_vselect_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41592,7 +41610,7 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
   ret <2 x bfloat> %op
 }
 
-define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
+define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) #0 {
 ; GCN-LABEL: s_select_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -41705,7 +41723,7 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
   ret i32 %readlane
 }
 
-define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
+define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) #0 {
 ; GCN-LABEL: s_select_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_lshr_b32 s2, s0, 16
@@ -41868,7 +41886,7 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
   ret i32 %readlane
 }
 
-define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
+define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) #0 {
 ; GCN-LABEL: s_vselect_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
@@ -42032,7 +42050,7 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
   ret i32 %readlane
 }
 
-define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
+define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) #0 {
 ; GCN-LABEL: v_select_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42112,7 +42130,7 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) #0 {
 ; GCN-LABEL: v_select_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42190,7 +42208,7 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
 }
 
-define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
+define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) #0 {
 ; GCN-LABEL: v_select_v6bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42276,7 +42294,7 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
   ret <6 x bfloat> %op
 }
 
-define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; GCN-LABEL: v_select_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42368,7 +42386,7 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
 }
 
-define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
+define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) #0 {
 ; GCN-LABEL: v_select_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42488,7 +42506,7 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
   ret <16 x bfloat> %op
 }
 
-define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
+define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) #0 {
 ; GCN-LABEL: v_select_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42697,7 +42715,7 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
   ret <32 x bfloat> %op
 }
 
-define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
+define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) #0 {
 ; GCN-LABEL: s_select_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -42822,7 +42840,7 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
   ret <2 x i32> %bv.1
 }
 
-define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
+define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) #0 {
 ; GCN-LABEL: s_select_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -42936,7 +42954,7 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
   ret <2 x i32> %bv.1
 }
 
-define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
+define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) #0 {
 ; GCN-LABEL: s_vselect_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
@@ -43208,7 +43226,7 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
   ret <2 x i32> %bv.1
 }
 
-define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) #0 {
 ; GCN-LABEL: v_vselect_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43429,7 +43447,7 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
   ret <4 x bfloat> %op
 }
 
-define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; GCN-LABEL: v_vselect_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43815,7 +43833,7 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
   ret <8 x bfloat> %op
 }
 
-define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
+define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) #0 {
 ; GCN-LABEL: v_vselect_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44538,7 +44556,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
   ret <16 x bfloat> %op
 }
 
-define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
+define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) #0 {
 ; GCN-LABEL: v_vselect_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44763,18 +44781,18 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v26
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[88:89], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v27
+; GFX7-NEXT:    v_writelane_b32 v33, s34, 0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[90:91], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v28
+; GFX7-NEXT:    v_writelane_b32 v33, s35, 1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[92:93], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX7-NEXT:    v_writelane_b32 v33, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v33, s30, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[94:95], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v30
-; GFX7-NEXT:    v_writelane_b32 v33, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v33, s31, 3
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32
-; GFX7-NEXT:    v_writelane_b32 v33, s34, 2
-; GFX7-NEXT:    v_writelane_b32 v33, s35, 3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
@@ -44844,6 +44862,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v1, v0, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    v_readlane_b32 s30, v33, 2
 ; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v3
 ; GFX7-NEXT:    v_bfi_b32 v1, s4, v2, v5
 ; GFX7-NEXT:    v_bfi_b32 v2, s4, v4, v7
@@ -44860,10 +44879,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX7-NEXT:    v_bfi_b32 v13, s4, v26, v29
 ; GFX7-NEXT:    v_bfi_b32 v14, s4, v28, v32
 ; GFX7-NEXT:    v_bfi_b32 v15, s4, v31, v30
-; GFX7-NEXT:    v_readlane_b32 s35, v33, 3
-; GFX7-NEXT:    v_readlane_b32 s34, v33, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v33, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v33, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v33, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v33, 1
+; GFX7-NEXT:    v_readlane_b32 s34, v33, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
@@ -44919,34 +44937,34 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v20
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v21
+; GFX8-NEXT:    v_writelane_b32 v34, s34, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v22
+; GFX8-NEXT:    v_writelane_b32 v34, s35, 1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[72:73], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v23
+; GFX8-NEXT:    v_writelane_b32 v34, s36, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[74:75], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v24
+; GFX8-NEXT:    v_writelane_b32 v34, s37, 3
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[76:77], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v34, s38, 4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[78:79], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v34, s39, 5
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[88:89], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v34, s30, 6
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[90:91], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v34, s31, 7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX8-NEXT:    v_writelane_b32 v34, s36, 4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v30
-; GFX8-NEXT:    v_writelane_b32 v34, s37, 5
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX8-NEXT:    v_writelane_b32 v34, s38, 6
-; GFX8-NEXT:    v_writelane_b32 v34, s39, 7
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
@@ -45072,6 +45090,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX8-NEXT:    v_readlane_b32 s30, v34, 6
 ; GFX8-NEXT:    v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -45080,14 +45099,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_readlane_b32 s39, v34, 7
-; GFX8-NEXT:    v_readlane_b32 s38, v34, 6
-; GFX8-NEXT:    v_readlane_b32 s37, v34, 5
-; GFX8-NEXT:    v_readlane_b32 s36, v34, 4
-; GFX8-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX8-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v34, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v34, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v34, 5
+; GFX8-NEXT:    v_readlane_b32 s38, v34, 4
+; GFX8-NEXT:    v_readlane_b32 s37, v34, 3
+; GFX8-NEXT:    v_readlane_b32 s36, v34, 2
+; GFX8-NEXT:    v_readlane_b32 s35, v34, 1
+; GFX8-NEXT:    v_readlane_b32 s34, v34, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45159,11 +45177,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_and_b32_e32 v0, 1, v28
 ; GFX900-NEXT:    v_cmp_eq_u32_e64 s[94:95], 1, v0
 ; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX900-NEXT:    v_writelane_b32 v33, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v33, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v33, s34, 2
+; GFX900-NEXT:    v_writelane_b32 v33, s34, 0
+; GFX900-NEXT:    v_writelane_b32 v33, s35, 1
+; GFX900-NEXT:    v_writelane_b32 v33, s30, 2
+; GFX900-NEXT:    v_writelane_b32 v33, s31, 3
 ; GFX900-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX900-NEXT:    v_writelane_b32 v33, s35, 3
 ; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -45268,6 +45286,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_readlane_b32 s30, v33, 2
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v2, v5, s4
 ; GFX900-NEXT:    v_perm_b32 v2, v4, v7, s4
@@ -45284,10 +45303,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_perm_b32 v13, v26, v29, s4
 ; GFX900-NEXT:    v_perm_b32 v14, v28, v32, s4
 ; GFX900-NEXT:    v_perm_b32 v15, v31, v30, s4
-; GFX900-NEXT:    v_readlane_b32 s35, v33, 3
-; GFX900-NEXT:    v_readlane_b32 s34, v33, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v33, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v33, 0
+; GFX900-NEXT:    v_readlane_b32 s31, v33, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v33, 1
+; GFX900-NEXT:    v_readlane_b32 s34, v33, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -46400,7 +46418,7 @@ declare <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
 declare <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>)
 declare <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>)
 
-define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
+define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; GCN-LABEL: v_fma_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46524,7 +46542,7 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fma_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46710,7 +46728,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
+define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fma_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46968,7 +46986,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fma_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47286,7 +47304,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
   ret <4 x bfloat> %op
 }
 
-define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
+define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fma_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47867,7 +47885,7 @@ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat>
   ret <8 x bfloat> %op
 }
 
-define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) {
+define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fma_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -48949,7 +48967,7 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
   ret <16 x bfloat> %op
 }
 
-define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) {
+define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fma_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -51364,7 +51382,7 @@ declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloa
 declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
 declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
 
-define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
+define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; GCN-LABEL: v_fmuladd_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -51492,7 +51510,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fmuladd_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -51686,7 +51704,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
+define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fmuladd_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -51956,7 +51974,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) #0 {
 ; GCN-LABEL: v_fmuladd_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52289,3 +52307,5 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
   %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
   ret <4 x bfloat> %op
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index ab2ad19d0f1bf..fb11d3b7d9d65 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -902,47 +902,47 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt expcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v0, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s33, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s34, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s35, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s36, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s37, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s38, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s39, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s48, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s49, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s50, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s51, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s52, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s53, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s54, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s55, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s64, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s65, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s66, 19
-; CHECK-NEXT:    v_writelane_b32 v0, s67, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s68, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s69, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s70, 23
-; CHECK-NEXT:    v_writelane_b32 v0, s71, 24
-; CHECK-NEXT:    v_writelane_b32 v0, s80, 25
-; CHECK-NEXT:    v_writelane_b32 v0, s81, 26
-; CHECK-NEXT:    v_writelane_b32 v0, s82, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s83, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s84, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s85, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s86, 31
-; CHECK-NEXT:    v_writelane_b32 v0, s87, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s96, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s97, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s98, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s99, 36
+; CHECK-NEXT:    v_writelane_b32 v0, s33, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s34, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s35, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s36, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s37, 4
+; CHECK-NEXT:    v_writelane_b32 v0, s38, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s39, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s48, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s49, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s50, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s51, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s52, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s53, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s54, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s55, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s64, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s65, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s66, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s67, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s68, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s69, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s70, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s71, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s80, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s81, 24
+; CHECK-NEXT:    v_writelane_b32 v0, s82, 25
+; CHECK-NEXT:    v_writelane_b32 v0, s83, 26
+; CHECK-NEXT:    v_writelane_b32 v0, s84, 27
+; CHECK-NEXT:    v_writelane_b32 v0, s85, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s86, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s87, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s96, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s97, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s98, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s99, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s100, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s101, 36
 ; CHECK-NEXT:    s_mov_b32 s40, s12
-; CHECK-NEXT:    v_writelane_b32 v0, s100, 37
+; CHECK-NEXT:    v_writelane_b32 v0, s30, 37
 ; CHECK-NEXT:    s_cmp_eq_u32 s40, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s101, 38
+; CHECK-NEXT:    v_writelane_b32 v0, s31, 38
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1380,6 +1380,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use s31
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s30, v0, 37
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use s32
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1596,45 +1597,44 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use vcc_hi
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s101, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s100, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s99, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s98, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s97, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s96, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s87, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s86, v0, 31
-; CHECK-NEXT:    v_readlane_b32 s85, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s84, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s83, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s82, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s81, v0, 26
-; CHECK-NEXT:    v_readlane_b32 s80, v0, 25
-; CHECK-NEXT:    v_readlane_b32 s71, v0, 24
-; CHECK-NEXT:    v_readlane_b32 s70, v0, 23
-; CHECK-NEXT:    v_readlane_b32 s69, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s68, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s67, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s66, v0, 19
-; CHECK-NEXT:    v_readlane_b32 s65, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s64, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s55, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s54, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s53, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s52, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s51, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s50, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s49, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s48, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s39, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s38, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s37, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s36, v0, 5
-; CHECK-NEXT:    v_readlane_b32 s35, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s34, v0, 3
-; CHECK-NEXT:    v_readlane_b32 s33, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v0, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s101, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s100, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s99, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s98, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s97, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s96, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s87, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s86, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s85, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s84, v0, 27
+; CHECK-NEXT:    v_readlane_b32 s83, v0, 26
+; CHECK-NEXT:    v_readlane_b32 s82, v0, 25
+; CHECK-NEXT:    v_readlane_b32 s81, v0, 24
+; CHECK-NEXT:    v_readlane_b32 s80, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s71, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s70, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s69, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s68, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s67, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s66, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s65, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s64, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s55, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s52, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s51, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s50, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s49, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s48, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s39, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s38, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s37, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s36, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s35, v0, 2
+; CHECK-NEXT:    v_readlane_b32 s34, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s33, v0, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
index bf43474d1bd7f..485cedb33dcca 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
@@ -25,8 +25,8 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -53,8 +53,8 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -84,8 +84,8 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -112,8 +112,8 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
index c4558c7083ba2..2d49681c22496 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll
@@ -43,8 +43,8 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s42
 ; CHECK-NEXT:    v_writelane_b32 v1, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -76,8 +76,8 @@ define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s43
 ; CHECK-NEXT:    v_writelane_b32 v2, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -109,8 +109,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s43
 ; CHECK-NEXT:    v_writelane_b32 v2, s31, 1
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -121,3 +121,5 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
   call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
   ret void
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index 084cfcee28c63..ddd94108a1e39 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -48,8 +48,8 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -76,8 +76,8 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -107,8 +107,8 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -135,8 +135,8 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -166,8 +166,8 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -194,8 +194,8 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -225,8 +225,8 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -253,8 +253,8 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -284,8 +284,8 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -312,8 +312,8 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -343,8 +343,8 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -371,8 +371,8 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -402,8 +402,8 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -430,8 +430,8 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -461,8 +461,8 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -489,8 +489,8 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -520,8 +520,8 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -548,8 +548,8 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -579,8 +579,8 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -607,8 +607,8 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -638,8 +638,8 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -666,8 +666,8 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -697,8 +697,8 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -725,8 +725,8 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -756,8 +756,8 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -784,8 +784,8 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -815,8 +815,8 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -843,8 +843,8 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -874,8 +874,8 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -902,8 +902,8 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -933,8 +933,8 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -961,8 +961,8 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -992,8 +992,8 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1020,8 +1020,8 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1051,8 +1051,8 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1079,8 +1079,8 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1110,8 +1110,8 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1138,8 +1138,8 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1169,8 +1169,8 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1197,8 +1197,8 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1228,8 +1228,8 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX9-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1256,8 +1256,8 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[26:27]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1289,8 +1289,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
 ; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1317,8 +1317,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 8bc188ed41335..834517d9c9b39 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -7179,8 +7179,8 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 10
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7247,8 +7247,8 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 10
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7315,8 +7315,8 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7365,8 +7365,8 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7433,8 +7433,8 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 10
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7518,8 +7518,8 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7586,8 +7586,8 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7654,8 +7654,8 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7708,8 +7708,8 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7776,8 +7776,8 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7869,8 +7869,8 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 6
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -7945,8 +7945,8 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 6
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8021,8 +8021,8 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8076,8 +8076,8 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8152,8 +8152,8 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 6
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8241,8 +8241,8 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; VI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
 ; VI-NEXT:    s_mov_b32 s32, s33
 ; VI-NEXT:    v_readlane_b32 s4, v40, 2
 ; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8317,8 +8317,8 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; CI-NEXT:    v_writelane_b32 v40, s31, 1
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
 ; CI-NEXT:    s_mov_b32 s32, s33
 ; CI-NEXT:    v_readlane_b32 s4, v40, 2
 ; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8393,8 +8393,8 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -8451,8 +8451,8 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8527,8 +8527,8 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; HSA-NEXT:    v_writelane_b32 v40, s31, 1
 ; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
 ; HSA-NEXT:    s_mov_b32 s32, s33
 ; HSA-NEXT:    v_readlane_b32 s4, v40, 2
 ; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index c3bcaf1808acf..1d6299b5fc177 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -25,8 +25,8 @@ define void @use_vcc() #1 {
 ; GCN: v_writelane_b32 v40, s30, 0
 ; GCN: v_writelane_b32 v40, s31, 1
 ; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s31, v40, 1
 ; GCN: v_readlane_b32 s30, v40, 0
+; GCN: v_readlane_b32 s31, v40, 1
 ; GCN: v_readlane_b32 s4, v40, 2
 ; GCN: s_mov_b32 s33, s4
 ; GCN: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index 3117d5398f089..d443ad62ac7ef 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -40,22 +40,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT:    v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 2
 ; MUBUF-NEXT:    s_getpc_b64 s[34:35]
 ; MUBUF-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 3
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT:    v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 4
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -74,22 +74,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 2
 ; FLATSCR-NEXT:    s_getpc_b64 s[34:35]
 ; FLATSCR-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 3
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 4
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -114,20 +114,20 @@ define void @test_func_call_external_void_funcx2() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT:    v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 2
 ; MUBUF-NEXT:    s_getpc_b64 s[34:35]
 ; MUBUF-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 3
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT:    v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 4
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -146,20 +146,20 @@ define void @test_func_call_external_void_funcx2() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 2
 ; FLATSCR-NEXT:    s_getpc_b64 s[34:35]
 ; FLATSCR-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 3
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 4
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -185,8 +185,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s31, v0, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v0, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v0, 1
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
@@ -204,8 +204,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s31, v0, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v0, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v0, 1
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
@@ -452,23 +452,23 @@ define void @callee_saved_sgpr_func() #2 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s4, 3
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s34, 0
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 2
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def s40
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    s_mov_b32 s34, s40
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 1
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; use s34
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s34, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 3
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -488,23 +488,23 @@ define void @callee_saved_sgpr_func() #2 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 3
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 1
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 2
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_mov_b32 s34, s40
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 1
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s34
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 3
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -555,13 +555,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
 ; MUBUF-NEXT:    v_writelane_b32 v41, s4, 3
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_writelane_b32 v41, s30, 0
-; MUBUF-NEXT:    v_writelane_b32 v41, s31, 1
+; MUBUF-NEXT:    v_writelane_b32 v41, s34, 0
+; MUBUF-NEXT:    v_writelane_b32 v41, s30, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT:    v_writelane_b32 v41, s34, 2
+; MUBUF-NEXT:    v_writelane_b32 v41, s31, 2
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; def s40
 ; MUBUF-NEXT:    ;;#ASMEND
@@ -577,9 +577,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; MUBUF-NEXT:    ; use v40
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT:    v_readlane_b32 s34, v41, 2
-; MUBUF-NEXT:    v_readlane_b32 s31, v41, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v41, 0
+; MUBUF-NEXT:    v_readlane_b32 s30, v41, 1
+; MUBUF-NEXT:    v_readlane_b32 s31, v41, 2
+; MUBUF-NEXT:    v_readlane_b32 s34, v41, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v41, 3
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -599,13 +599,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    v_writelane_b32 v41, s0, 3
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v41, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v41, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v41, s34, 0
+; FLATSCR-NEXT:    v_writelane_b32 v41, s30, 1
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT:    v_writelane_b32 v41, s34, 2
+; FLATSCR-NEXT:    v_writelane_b32 v41, s31, 2
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; def s40
 ; FLATSCR-NEXT:    ;;#ASMEND
@@ -621,9 +621,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
 ; FLATSCR-NEXT:    ; use v40
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT:    v_readlane_b32 s34, v41, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v41, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v41, 0
+; FLATSCR-NEXT:    v_readlane_b32 s30, v41, 1
+; FLATSCR-NEXT:    v_readlane_b32 s31, v41, 2
+; FLATSCR-NEXT:    v_readlane_b32 s34, v41, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v41, 3
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll
index 0b04111b609ad..e8bf70da933c0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-skip.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll
@@ -34,8 +34,8 @@ define void @if_call(i32 %flag) #0 {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GCN-NEXT:  .LBB1_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v1, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v1, 0
+; GCN-NEXT:    v_readlane_b32 s31, v1, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 17159a3cf0a1a..962df52e984b2 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -132,8 +132,8 @@ define void @callee_with_stack_and_call() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -162,8 +162,8 @@ define void @callee_with_stack_and_call() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -201,8 +201,8 @@ define void @callee_no_stack_with_call() #0 {
 ; MUBUF-NEXT:    s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -228,8 +228,8 @@ define void @callee_no_stack_with_call() #0 {
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
@@ -359,24 +359,24 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
-; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 2
-; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 3
-; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 4
-; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 5
-; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 6
-; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 7
-; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 8
-; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 9
-; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 10
-; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 11
-; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 12
-; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 13
-; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 14
-; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 15
-; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 17
+; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 4
+; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 5
+; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 6
+; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 7
+; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 9
+; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 10
+; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 11
+; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 13
+; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 14
+; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 15
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 17
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    ;;#ASMSTART
@@ -414,6 +414,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[16:31]
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 16
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[72:79]
 ; FLATSCR-NEXT:    ;;#ASMEND
@@ -423,24 +424,23 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s[0:15]
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 17
-; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 16
-; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 15
-; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 14
-; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 13
-; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 12
-; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 11
-; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 10
-; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 9
-; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 8
-; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 7
-; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 6
-; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 5
-; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 4
-; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 3
-; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 2
-; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 17
+; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 15
+; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 14
+; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 13
+; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 12
+; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 11
+; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 10
+; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 9
+; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 8
+; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 7
+; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 6
+; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 5
+; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 4
+; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 0
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
@@ -971,14 +971,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
 ; MUBUF-NEXT:    v_writelane_b32 v1, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
-; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_writelane_b32 v1, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -997,14 +997,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
 ; FLATSCR-NEXT:    v_writelane_b32 v1, s30, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_writelane_b32 v1, s31, 1
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1037,17 +1037,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
-; MUBUF-NEXT:    s_addk_i32 s32, 0x300
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved initial VGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1066,17 +1066,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved initial VGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1118,18 +1118,18 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x1000
-; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40300
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved SGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40300
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; clobber nonpreserved VGPRs
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    s_add_i32 s6, s33, 0x40100
@@ -1158,11 +1158,11 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; clobber nonpreserved VGPRs
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    s_add_i32 s2, s33, 0x1004
@@ -1220,8 +1220,8 @@ define void @ipra_call_with_stack() #0 {
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
 ; MUBUF-NEXT:    s_mov_b32 s32, s33
 ; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1248,8 +1248,8 @@ define void @ipra_call_with_stack() #0 {
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
 ; FLATSCR-NEXT:    s_mov_b32 s32, s33
 ; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index 5f965ba431ab5..bb5963244da3c 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -430,8 +430,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -463,8 +463,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -496,8 +496,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -984,8 +984,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1048,8 +1048,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GFX90A-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1094,8 +1094,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
 ; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1445,8 +1445,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 10
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index bb2f06bfe83f8..f20be656f3af0 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -275,8 +275,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -308,8 +308,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -341,8 +341,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -696,8 +696,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -742,8 +742,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
 ; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1019,8 +1019,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 10
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -1469,8 +1469,8 @@ define void @func_call_no_workitem_id_hints() #2 {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 9
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
index 5797d2797f1ea..c63512f630aaf 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-entry.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
@@ -51,8 +51,8 @@ define void @caller() #0 {
 ; CHECK-NEXT:    s_wait_kmcnt 0x0
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
index ce9b8948e844a..2c463c5bfebac 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
@@ -41,11 +41,11 @@ define i32 @callee_returns_arg0(
     i32 inreg %a16, i32 inreg %a17, i32 inreg %a18, i32 inreg %a19,
     i32 inreg %a20, i32 inreg %a21, i32 inreg %a22, i32 inreg %a23,
     i32 inreg %a24, i32 inreg %a25, i32 inreg %a26, i32 inreg %a27,
-    i32 inreg %a28, i32 inreg %a29, i32 inreg %a30, i32 inreg %a31) {
+    i32 inreg %a28, i32 inreg %a29, i32 inreg %a30, i32 inreg %a31) #0 {
   ret i32 %a0
 }
 
-define i32 @caller_passes_42() {
+define i32 @caller_passes_42() #0 {
 ; CHECK-LABEL: caller_passes_42:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -153,8 +153,8 @@ define i32 @caller_passes_42() {
 ; SDAG-NEXT:    v_writelane_b32 v18, s31, 1
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; SDAG-NEXT:    v_readlane_b32 s31, v18, 1
 ; SDAG-NEXT:    v_readlane_b32 s30, v18, 0
+; SDAG-NEXT:    v_readlane_b32 s31, v18, 1
 ; SDAG-NEXT:    s_mov_b32 s32, s33
 ; SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; SDAG-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -212,8 +212,8 @@ define i32 @caller_passes_42() {
 ; GISEL-NEXT:    v_writelane_b32 v18, s31, 1
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; GISEL-NEXT:    v_readlane_b32 s31, v18, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v18, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v18, 1
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -232,3 +232,5 @@ define i32 @caller_passes_42() {
     i32 inreg 28, i32 inreg 29, i32 inreg 30, i32 inreg 31)
   ret i32 %r
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index ef676ddc8070e..b7f3578d06efc 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -149,7 +149,7 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
   ret <2 x double> %pow_sign1
 }
 
-define float @copysign_f32_f32_sign_known_p0_or_n0(float %x, i32 %y.i) {
+define float @copysign_f32_f32_sign_known_p0_or_n0(float %x, i32 %y.i) #2 {
 ; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -163,7 +163,7 @@ define float @copysign_f32_f32_sign_known_p0_or_n0(float %x, i32 %y.i) {
   ret float %copysign
 }
 
-define double @copysign_f64_f32_sign_known_p0_or_n0(double %x, i32 %y.i) {
+define double @copysign_f64_f32_sign_known_p0_or_n0(double %x, i32 %y.i) #2 {
 ; GFX9-LABEL: copysign_f64_f32_sign_known_p0_or_n0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -178,7 +178,7 @@ define double @copysign_f64_f32_sign_known_p0_or_n0(double %x, i32 %y.i) {
   ret double %copysign
 }
 
-define half @copysign_f16_f32_sign_known_p0_or_n0(half %x, i32 %y.i) {
+define half @copysign_f16_f32_sign_known_p0_or_n0(half %x, i32 %y.i) #2 {
 ; GFX9-LABEL: copysign_f16_f32_sign_known_p0_or_n0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -194,7 +194,7 @@ define half @copysign_f16_f32_sign_known_p0_or_n0(half %x, i32 %y.i) {
   ret half %copysign
 }
 
-define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_fabs(float %x.arg, i32 %y.i) {
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_fabs(float %x.arg, i32 %y.i) #2 {
 ; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_fabs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -209,7 +209,7 @@ define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_fabs(floa
   ret float %copysign
 }
 
-define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_select(float %x.arg, i32 %y.i) {
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_select(float %x.arg, i32 %y.i) #2 {
 ; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_select:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,7 +227,7 @@ define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_select(fl
   ret float %copysign
 }
 
-define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_nnan_nsz_sqrt(float %x.arg, i32 %y.i) {
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_nnan_nsz_sqrt(float %x.arg, i32 %y.i) #2 {
 ; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_nnan_nsz_sqrt:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -260,7 +260,7 @@ define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_nnan_nsz_
   ret float %copysign
 }
 
-define float @copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nsz_sqrt(float %x.arg, i32 %y.i) {
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nsz_sqrt(float %x.arg, i32 %y.i) #2 {
 ; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nsz_sqrt:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -293,7 +293,7 @@ define float @copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nsz_sqrt
   ret float %copysign
 }
 
-define float @copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nnan_sqrt(float %x.arg, i32 %y.i) {
+define float @copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nnan_sqrt(float %x.arg, i32 %y.i) #2 {
 ; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nnan_sqrt:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -326,7 +326,7 @@ define float @copysign_f32_f32_sign_known_p0_or_n0__mag_almost_positive_nnan_sqr
   ret float %copysign
 }
 
-define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
+define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) #2 {
 ; GFX9-LABEL: test_copysign_pow_fast_f32__integral_y:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -378,29 +378,29 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-NEXT:    v_writelane_b32 v43, s16, 14
-; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v43, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v43, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v43, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v43, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v43, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v43, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v43, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v43, s50, 10
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v43, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v43, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v43, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v43, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v43, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v43, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v43, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v43, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v43, s52, 10
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v43, s51, 11
+; GFX9-NEXT:    v_writelane_b32 v43, s53, 11
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v1
-; GFX9-NEXT:    v_writelane_b32 v43, s52, 12
+; GFX9-NEXT:    v_writelane_b32 v43, s30, 12
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; GFX9-NEXT:    s_getpc_b64 s[16:17]
 ; GFX9-NEXT:    s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v43, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v43, s31, 13
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v31
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v2
 ; GFX9-NEXT:    s_mov_b32 s50, s15
@@ -432,21 +432,21 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s30, v43, 12
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX9-NEXT:    v_readlane_b32 s53, v43, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v43, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v43, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v43, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v43, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v43, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v43, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v43, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v43, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v43, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v43, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v43, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v43, 13
+; GFX9-NEXT:    v_readlane_b32 s53, v43, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v43, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v43, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v43, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v43, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v43, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v43, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v43, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v43, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v43, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v43, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v43, 14
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -474,3 +474,4 @@ declare hidden double @_Z4log2d(double) #1
 
 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 attributes #1 = { norecurse nounwind memory(read) }
+attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 45836ff81f774..5ca7a309cadad 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -40,8 +40,8 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -76,8 +76,8 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -112,8 +112,8 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -148,8 +148,8 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_struct@rel32@hi+12
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
@@ -286,5 +286,5 @@ declare hidden <3 x half> @func_v3f16()
 
 declare hidden { <4 x i32>, <4 x half> } @func_struct() #0
 
-attributes #0 = { nounwind}
+attributes #0 = { nounwind }
 
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
index 405d6a8b4f969..972f897013419 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -2005,8 +2005,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX900-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
 ; GFX900-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX900-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX900-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX900-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX900-NEXT:    s_mov_b32 s32, s33
 ; GFX900-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX900-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2278,8 +2278,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX90A-V2A-DIS-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
 ; GFX90A-V2A-DIS-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-V2A-DIS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-DIS-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-V2A-DIS-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-V2A-DIS-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2551,8 +2551,8 @@ define hidden void @func_call_clobber() #0 {
 ; GFX90A-V2A-EN-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
 ; GFX90A-V2A-EN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX90A-V2A-EN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-V2A-EN-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-V2A-EN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX90A-V2A-EN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2793,8 +2793,8 @@ define hidden void @func_call_clobber() #0 {
 ; WAVE32-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
 ; WAVE32-NEXT:    v_writelane_b32 v40, s31, 1
 ; WAVE32-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; WAVE32-NEXT:    v_readlane_b32 s31, v40, 1
 ; WAVE32-NEXT:    v_readlane_b32 s30, v40, 0
+; WAVE32-NEXT:    v_readlane_b32 s31, v40, 1
 ; WAVE32-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-NEXT:    v_readlane_b32 s4, v40, 2
 ; WAVE32-NEXT:    s_or_saveexec_b32 s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index a0c25b2a0beb3..fb2da9a5b934c 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -8,7 +8,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
 
 declare ptr @__kmpc_alloc_shared()
 
-define weak_odr void @test(i32 %0) !dbg !34 {
+define weak_odr void @test(i32 %0) #1 !dbg !34 {
 ; CHECK-LABEL: test:
 ; CHECK:       .Lfunc_begin0:
 ; CHECK-NEXT:    .loc 1 288 0 ; dummy:288:0
@@ -489,22 +489,20 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    v_writelane_b32 v41, s16, 16
 ; CHECK-NEXT:    .cfi_llvm_vector_registers 65, 2601, 16, 32
 ; CHECK-NEXT:    .cfi_def_cfa_register 65
-; CHECK-NEXT:    v_writelane_b32 v41, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v41, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v41, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v41, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v41, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v41, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v41, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v41, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v41, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v41, s49, 9
-; CHECK-NEXT:    v_writelane_b32 v41, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v41, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v41, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v41, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v41, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v41, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v41, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v41, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v41, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v41, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v41, s49, 7
+; CHECK-NEXT:    v_writelane_b32 v41, s50, 8
+; CHECK-NEXT:    v_writelane_b32 v41, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v41, s52, 10
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v41, s53, 13
-; CHECK-NEXT:    v_writelane_b32 v41, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v41, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v41, s54, 12
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    ;DEBUG_VALUE: dummy:dummy <- undef
 ; CHECK-NEXT:  .Ltmp0:
@@ -512,10 +510,12 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12
-; CHECK-NEXT:    v_writelane_b32 v41, s55, 15
+; CHECK-NEXT:    v_writelane_b32 v41, s55, 13
 ; CHECK-NEXT:    s_load_dwordx2 s[54:55], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v41, s30, 14
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[48:49]
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v41, s31, 15
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -541,23 +541,23 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    .loc 1 0 9 is_stmt 0 ; dummy:0:9
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v41, 14
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
-; CHECK-NEXT:    v_readlane_b32 s55, v41, 15
-; CHECK-NEXT:    v_readlane_b32 s54, v41, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v41, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v41, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v41, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v41, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v41, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v41, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v41, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v41, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v41, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v41, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v41, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v41, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v41, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v41, 15
+; CHECK-NEXT:    v_readlane_b32 s55, v41, 13
+; CHECK-NEXT:    v_readlane_b32 s54, v41, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v41, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v41, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v41, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v41, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v41, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v41, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v41, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v41, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v41, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v41, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v41, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v41, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v41, 16
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -576,6 +576,7 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 }
 
 attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
+attributes #1 = { nounwind }
 
 !llvm.dbg.cu = !{!0, !25, !26}
 !llvm.module.flags = !{!27, !28, !29, !30, !31, !32, !44}
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index d224cfe27b226..fbacc61492674 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -299,8 +299,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
 ; CHECK-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; CHECK-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -339,8 +339,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
 ; CHECK-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; CHECK-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; CHECK-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
index 1269b2d0f7367..2d620a14da405 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
@@ -24,34 +24,34 @@ define void @wobble() #0 {
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    v_mov_b32_e32 v41, 0
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 1
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 3
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; CHECK-NEXT:    v_mov_b32_e32 v42, s9
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 5
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_writelane_b32 v43, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_lshr_b32 s5, s33, 5
 ; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s50, 8
 ; CHECK-NEXT:    s_mov_b32 s50, s15
-; CHECK-NEXT:    v_writelane_b32 v43, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s51, 9
 ; CHECK-NEXT:    s_mov_b32 s51, s14
-; CHECK-NEXT:    v_writelane_b32 v43, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s52, 10
 ; CHECK-NEXT:    s_mov_b32 s52, s13
-; CHECK-NEXT:    v_writelane_b32 v43, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s53, 11
 ; CHECK-NEXT:    s_mov_b32 s53, s12
-; CHECK-NEXT:    v_writelane_b32 v43, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s54, 12
 ; CHECK-NEXT:    s_add_i32 s54, s5, 16
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 13
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 14
 ; CHECK-NEXT:    s_inst_prefetch 0x1
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_1: ; %bb1
@@ -91,21 +91,21 @@ define void @wobble() #0 {
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8
-; CHECK-NEXT:    v_readlane_b32 s54, v43, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v43, 15
 ; CHECK-NEXT:    s_or_saveexec_b32 s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 76a2114a000cf..cba5aa8ef3672 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -55,8 +55,8 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    v_readlane_b32 s31, v42, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v42, 0
+; GCN-NEXT:    v_readlane_b32 s31, v42, 1
 ; GCN-NEXT:    s_mov_b32 s32, s34
 ; GCN-NEXT:    v_readlane_b32 s4, v42, 2
 ; GCN-NEXT:    v_readlane_b32 s34, v42, 3
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 2e88da142bb41..6abe5998d6767 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -26,8 +26,8 @@ define void @callee_with_stack_and_call() #0 {
 ; SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v40, 1
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v40, 0
+; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v40, 1
 ; SPILL-TO-VGPR-NEXT:    s_mov_b32 s32, s33
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s4, v40, 2
 ; SPILL-TO-VGPR-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -46,21 +46,14 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0x800
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s30, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; NO-SPILL-TO-VGPR-NEXT:    s_getpc_b64 s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
@@ -69,20 +62,12 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v0, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v0, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v0, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index c724404243b74..aff30d682f20a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -2065,8 +2065,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2095,8 +2095,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2484,8 +2484,8 @@ define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %p
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -2526,8 +2526,8 @@ define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %p
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 1d48a5c727fa8..f22ba70d32ae5 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -4,7 +4,7 @@
 
 declare void @extern_c_func()
 
-define amdgpu_gfx void @gfx_func() {
+define amdgpu_gfx void @gfx_func() #0 {
 ; SDAG-LABEL: gfx_func:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39,47 +39,47 @@ define amdgpu_gfx void @gfx_func() {
 ; SDAG-NEXT:    v_writelane_b32 v40, s27, 23
 ; SDAG-NEXT:    v_writelane_b32 v40, s28, 24
 ; SDAG-NEXT:    v_writelane_b32 v40, s29, 25
-; SDAG-NEXT:    v_writelane_b32 v40, s30, 26
-; SDAG-NEXT:    v_writelane_b32 v40, s31, 27
-; SDAG-NEXT:    v_writelane_b32 v40, s72, 28
-; SDAG-NEXT:    v_writelane_b32 v40, s73, 29
-; SDAG-NEXT:    v_writelane_b32 v40, s74, 30
-; SDAG-NEXT:    v_writelane_b32 v40, s75, 31
-; SDAG-NEXT:    v_writelane_b32 v40, s76, 32
-; SDAG-NEXT:    v_writelane_b32 v40, s77, 33
-; SDAG-NEXT:    v_writelane_b32 v40, s78, 34
-; SDAG-NEXT:    v_writelane_b32 v40, s79, 35
-; SDAG-NEXT:    v_writelane_b32 v40, s88, 36
-; SDAG-NEXT:    v_writelane_b32 v40, s89, 37
-; SDAG-NEXT:    v_writelane_b32 v40, s90, 38
-; SDAG-NEXT:    v_writelane_b32 v40, s91, 39
-; SDAG-NEXT:    v_writelane_b32 v40, s92, 40
-; SDAG-NEXT:    v_writelane_b32 v40, s93, 41
-; SDAG-NEXT:    v_writelane_b32 v40, s94, 42
+; SDAG-NEXT:    v_writelane_b32 v40, s72, 26
+; SDAG-NEXT:    v_writelane_b32 v40, s73, 27
+; SDAG-NEXT:    v_writelane_b32 v40, s74, 28
+; SDAG-NEXT:    v_writelane_b32 v40, s75, 29
+; SDAG-NEXT:    v_writelane_b32 v40, s76, 30
+; SDAG-NEXT:    v_writelane_b32 v40, s77, 31
+; SDAG-NEXT:    v_writelane_b32 v40, s78, 32
+; SDAG-NEXT:    v_writelane_b32 v40, s79, 33
+; SDAG-NEXT:    v_writelane_b32 v40, s88, 34
+; SDAG-NEXT:    v_writelane_b32 v40, s89, 35
+; SDAG-NEXT:    v_writelane_b32 v40, s90, 36
+; SDAG-NEXT:    v_writelane_b32 v40, s91, 37
+; SDAG-NEXT:    v_writelane_b32 v40, s92, 38
+; SDAG-NEXT:    v_writelane_b32 v40, s93, 39
+; SDAG-NEXT:    v_writelane_b32 v40, s94, 40
+; SDAG-NEXT:    v_writelane_b32 v40, s95, 41
+; SDAG-NEXT:    v_writelane_b32 v40, s30, 42
 ; SDAG-NEXT:    s_mov_b32 s35, extern_c_func@abs32@hi
 ; SDAG-NEXT:    s_mov_b32 s34, extern_c_func@abs32@lo
 ; SDAG-NEXT:    s_mov_b64 s[8:9], 0
 ; SDAG-NEXT:    s_addk_i32 s32, 0x400
-; SDAG-NEXT:    v_writelane_b32 v40, s95, 43
+; SDAG-NEXT:    v_writelane_b32 v40, s31, 43
 ; SDAG-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; SDAG-NEXT:    v_readlane_b32 s95, v40, 43
-; SDAG-NEXT:    v_readlane_b32 s94, v40, 42
-; SDAG-NEXT:    v_readlane_b32 s93, v40, 41
-; SDAG-NEXT:    v_readlane_b32 s92, v40, 40
-; SDAG-NEXT:    v_readlane_b32 s91, v40, 39
-; SDAG-NEXT:    v_readlane_b32 s90, v40, 38
-; SDAG-NEXT:    v_readlane_b32 s89, v40, 37
-; SDAG-NEXT:    v_readlane_b32 s88, v40, 36
-; SDAG-NEXT:    v_readlane_b32 s79, v40, 35
-; SDAG-NEXT:    v_readlane_b32 s78, v40, 34
-; SDAG-NEXT:    v_readlane_b32 s77, v40, 33
-; SDAG-NEXT:    v_readlane_b32 s76, v40, 32
-; SDAG-NEXT:    v_readlane_b32 s75, v40, 31
-; SDAG-NEXT:    v_readlane_b32 s74, v40, 30
-; SDAG-NEXT:    v_readlane_b32 s73, v40, 29
-; SDAG-NEXT:    v_readlane_b32 s72, v40, 28
-; SDAG-NEXT:    v_readlane_b32 s31, v40, 27
-; SDAG-NEXT:    v_readlane_b32 s30, v40, 26
+; SDAG-NEXT:    v_readlane_b32 s30, v40, 42
+; SDAG-NEXT:    v_readlane_b32 s31, v40, 43
+; SDAG-NEXT:    v_readlane_b32 s95, v40, 41
+; SDAG-NEXT:    v_readlane_b32 s94, v40, 40
+; SDAG-NEXT:    v_readlane_b32 s93, v40, 39
+; SDAG-NEXT:    v_readlane_b32 s92, v40, 38
+; SDAG-NEXT:    v_readlane_b32 s91, v40, 37
+; SDAG-NEXT:    v_readlane_b32 s90, v40, 36
+; SDAG-NEXT:    v_readlane_b32 s89, v40, 35
+; SDAG-NEXT:    v_readlane_b32 s88, v40, 34
+; SDAG-NEXT:    v_readlane_b32 s79, v40, 33
+; SDAG-NEXT:    v_readlane_b32 s78, v40, 32
+; SDAG-NEXT:    v_readlane_b32 s77, v40, 31
+; SDAG-NEXT:    v_readlane_b32 s76, v40, 30
+; SDAG-NEXT:    v_readlane_b32 s75, v40, 29
+; SDAG-NEXT:    v_readlane_b32 s74, v40, 28
+; SDAG-NEXT:    v_readlane_b32 s73, v40, 27
+; SDAG-NEXT:    v_readlane_b32 s72, v40, 26
 ; SDAG-NEXT:    v_readlane_b32 s29, v40, 25
 ; SDAG-NEXT:    v_readlane_b32 s28, v40, 24
 ; SDAG-NEXT:    v_readlane_b32 s27, v40, 23
@@ -148,47 +148,47 @@ define amdgpu_gfx void @gfx_func() {
 ; GISEL-NEXT:    v_writelane_b32 v40, s27, 23
 ; GISEL-NEXT:    v_writelane_b32 v40, s28, 24
 ; GISEL-NEXT:    v_writelane_b32 v40, s29, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s72, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s73, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s74, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s75, 31
-; GISEL-NEXT:    v_writelane_b32 v40, s76, 32
-; GISEL-NEXT:    v_writelane_b32 v40, s77, 33
-; GISEL-NEXT:    v_writelane_b32 v40, s78, 34
-; GISEL-NEXT:    v_writelane_b32 v40, s79, 35
-; GISEL-NEXT:    v_writelane_b32 v40, s88, 36
-; GISEL-NEXT:    v_writelane_b32 v40, s89, 37
-; GISEL-NEXT:    v_writelane_b32 v40, s90, 38
-; GISEL-NEXT:    v_writelane_b32 v40, s91, 39
-; GISEL-NEXT:    v_writelane_b32 v40, s92, 40
-; GISEL-NEXT:    v_writelane_b32 v40, s93, 41
-; GISEL-NEXT:    v_writelane_b32 v40, s94, 42
+; GISEL-NEXT:    v_writelane_b32 v40, s72, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s73, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s74, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s75, 29
+; GISEL-NEXT:    v_writelane_b32 v40, s76, 30
+; GISEL-NEXT:    v_writelane_b32 v40, s77, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s78, 32
+; GISEL-NEXT:    v_writelane_b32 v40, s79, 33
+; GISEL-NEXT:    v_writelane_b32 v40, s88, 34
+; GISEL-NEXT:    v_writelane_b32 v40, s89, 35
+; GISEL-NEXT:    v_writelane_b32 v40, s90, 36
+; GISEL-NEXT:    v_writelane_b32 v40, s91, 37
+; GISEL-NEXT:    v_writelane_b32 v40, s92, 38
+; GISEL-NEXT:    v_writelane_b32 v40, s93, 39
+; GISEL-NEXT:    v_writelane_b32 v40, s94, 40
+; GISEL-NEXT:    v_writelane_b32 v40, s95, 41
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 42
 ; GISEL-NEXT:    s_mov_b32 s34, extern_c_func@abs32@lo
 ; GISEL-NEXT:    s_mov_b32 s35, extern_c_func@abs32@hi
 ; GISEL-NEXT:    s_mov_b64 s[8:9], 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s95, 43
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 43
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GISEL-NEXT:    v_readlane_b32 s95, v40, 43
-; GISEL-NEXT:    v_readlane_b32 s94, v40, 42
-; GISEL-NEXT:    v_readlane_b32 s93, v40, 41
-; GISEL-NEXT:    v_readlane_b32 s92, v40, 40
-; GISEL-NEXT:    v_readlane_b32 s91, v40, 39
-; GISEL-NEXT:    v_readlane_b32 s90, v40, 38
-; GISEL-NEXT:    v_readlane_b32 s89, v40, 37
-; GISEL-NEXT:    v_readlane_b32 s88, v40, 36
-; GISEL-NEXT:    v_readlane_b32 s79, v40, 35
-; GISEL-NEXT:    v_readlane_b32 s78, v40, 34
-; GISEL-NEXT:    v_readlane_b32 s77, v40, 33
-; GISEL-NEXT:    v_readlane_b32 s76, v40, 32
-; GISEL-NEXT:    v_readlane_b32 s75, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s74, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s73, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s72, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 42
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 43
+; GISEL-NEXT:    v_readlane_b32 s95, v40, 41
+; GISEL-NEXT:    v_readlane_b32 s94, v40, 40
+; GISEL-NEXT:    v_readlane_b32 s93, v40, 39
+; GISEL-NEXT:    v_readlane_b32 s92, v40, 38
+; GISEL-NEXT:    v_readlane_b32 s91, v40, 37
+; GISEL-NEXT:    v_readlane_b32 s90, v40, 36
+; GISEL-NEXT:    v_readlane_b32 s89, v40, 35
+; GISEL-NEXT:    v_readlane_b32 s88, v40, 34
+; GISEL-NEXT:    v_readlane_b32 s79, v40, 33
+; GISEL-NEXT:    v_readlane_b32 s78, v40, 32
+; GISEL-NEXT:    v_readlane_b32 s77, v40, 31
+; GISEL-NEXT:    v_readlane_b32 s76, v40, 30
+; GISEL-NEXT:    v_readlane_b32 s75, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s74, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s73, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s72, v40, 26
 ; GISEL-NEXT:    v_readlane_b32 s29, v40, 25
 ; GISEL-NEXT:    v_readlane_b32 s28, v40, 24
 ; GISEL-NEXT:    v_readlane_b32 s27, v40, 23
@@ -225,3 +225,5 @@ define amdgpu_gfx void @gfx_func() {
   call void @extern_c_func()
   ret void
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index e3a5c408364f5..3f5ad3fc6e347 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -141,8 +141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -170,8 +170,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -200,8 +200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -229,8 +229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -264,8 +264,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -295,8 +295,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -326,8 +326,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -357,8 +357,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -393,8 +393,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -424,8 +424,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -455,8 +455,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    scratch_store_b8 off, v0, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -486,8 +486,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -519,8 +519,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -547,8 +547,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -576,8 +576,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -604,8 +604,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -632,8 +632,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -665,8 +665,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -694,8 +694,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -724,8 +724,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -753,8 +753,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -782,8 +782,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -816,8 +816,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -845,8 +845,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -875,8 +875,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -933,8 +933,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -966,8 +966,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -994,8 +994,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1023,8 +1023,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1051,8 +1051,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1079,8 +1079,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1112,8 +1112,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1141,8 +1141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1171,8 +1171,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1200,8 +1200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1229,8 +1229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1263,8 +1263,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1292,8 +1292,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1322,8 +1322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1351,8 +1351,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1380,8 +1380,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1413,8 +1413,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1441,8 +1441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1470,8 +1470,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1498,8 +1498,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1531,8 +1531,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1560,8 +1560,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1589,8 +1589,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1618,8 +1618,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1652,8 +1652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1682,8 +1682,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1712,8 +1712,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1742,8 +1742,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1778,8 +1778,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1809,8 +1809,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1839,8 +1839,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1870,8 +1870,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1906,8 +1906,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -1938,8 +1938,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -1969,8 +1969,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2001,8 +2001,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2042,8 +2042,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2076,8 +2076,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2108,8 +2108,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2142,8 +2142,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2176,8 +2176,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2204,8 +2204,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2233,8 +2233,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2261,8 +2261,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2289,8 +2289,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2321,8 +2321,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2349,8 +2349,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2378,8 +2378,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2406,8 +2406,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2439,8 +2439,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2468,8 +2468,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2497,8 +2497,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2526,8 +2526,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2560,8 +2560,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2590,8 +2590,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2620,8 +2620,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2650,8 +2650,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2686,8 +2686,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2718,8 +2718,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2749,8 +2749,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2781,8 +2781,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2814,8 +2814,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2843,8 +2843,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2872,8 +2872,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2901,8 +2901,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -2936,8 +2936,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -2967,8 +2967,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -2997,8 +2997,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3028,8 +3028,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3065,8 +3065,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3098,8 +3098,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3129,8 +3129,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3162,8 +3162,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3199,8 +3199,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3232,8 +3232,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3265,8 +3265,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3297,8 +3297,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3330,8 +3330,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3368,8 +3368,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3401,8 +3401,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3433,8 +3433,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3466,8 +3466,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3505,8 +3505,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3539,8 +3539,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3572,8 +3572,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3606,8 +3606,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3647,8 +3647,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3683,8 +3683,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3718,8 +3718,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3754,8 +3754,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3798,8 +3798,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3837,8 +3837,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -3874,8 +3874,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3913,8 +3913,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -3989,8 +3989,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX9-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4060,8 +4060,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX10-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -4126,8 +4126,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
 ; GFX11-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4197,8 +4197,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v18, v33
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v19, v34
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4239,8 +4239,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4275,8 +4275,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -4311,8 +4311,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4346,8 +4346,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4382,8 +4382,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4430,8 +4430,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4466,8 +4466,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4509,8 +4509,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
@@ -4552,8 +4552,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -4593,8 +4593,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4649,8 +4649,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4687,8 +4687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    global_store_byte v[3:4], v2, off
 ; GFX10-NEXT:    global_store_short v[40:41], v0, off
 ; GFX10-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
@@ -4730,6 +4730,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, 2 :: v_dual_mov_b32 v4, 0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    global_store_b8 v[3:4], v2, off
 ; GFX11-TRUE16-NEXT:    global_store_b16 v[40:41], v0, off
@@ -4737,7 +4738,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -4773,8 +4773,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, 2 :: v_dual_mov_b32 v4, 0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
 ; GFX11-FAKE16-NEXT:    global_store_b8 v[3:4], v2, off
 ; GFX11-FAKE16-NEXT:    global_store_b16 v[40:41], v0, off
@@ -4817,8 +4817,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    global_store_byte v[3:4], v2, off
 ; GFX10-SCRATCH-NEXT:    global_store_short v[40:41], v0, off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
@@ -4873,8 +4873,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -4911,8 +4911,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-NEXT:    v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
@@ -4955,10 +4955,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    global_store_b32 v[40:41], v0, off
@@ -4999,8 +4999,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
@@ -5042,8 +5042,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
@@ -5103,8 +5103,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5145,8 +5145,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
@@ -5189,10 +5189,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v5, v0, v1, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v4, off
@@ -5201,7 +5202,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -5241,8 +5241,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v0, v1, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
@@ -5290,8 +5290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
@@ -5355,8 +5355,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5400,8 +5400,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-NEXT:    v_perm_b32 v5, v6, v7, 0xc0c0004
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
@@ -5449,18 +5449,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v4, v4, v5, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v5, v6, v7, 0xc0c0004
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX11-TRUE16-NEXT:    global_store_b64 v[40:41], v[0:1], off
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -5504,8 +5504,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
@@ -5555,8 +5555,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v5, v6, v7, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
@@ -5675,8 +5675,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v44, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -5781,8 +5781,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v44, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -5884,8 +5884,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -5990,8 +5990,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:8
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6026,8 +6026,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6054,8 +6054,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6083,8 +6083,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6111,8 +6111,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6144,8 +6144,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6172,8 +6172,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6201,8 +6201,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6229,8 +6229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6262,8 +6262,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6290,8 +6290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6319,8 +6319,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6347,8 +6347,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6381,8 +6381,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6410,8 +6410,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6439,8 +6439,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6468,8 +6468,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6501,8 +6501,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6530,8 +6530,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6560,8 +6560,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6589,8 +6589,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6621,8 +6621,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6649,8 +6649,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6678,8 +6678,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6706,8 +6706,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6740,8 +6740,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6769,8 +6769,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6799,8 +6799,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6828,8 +6828,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6860,8 +6860,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -6888,8 +6888,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -6917,8 +6917,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6945,8 +6945,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -6978,8 +6978,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7006,8 +7006,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7035,8 +7035,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7063,8 +7063,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7097,8 +7097,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7126,8 +7126,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7155,8 +7155,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7184,8 +7184,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7218,8 +7218,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7248,8 +7248,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7278,8 +7278,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7308,8 +7308,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7343,8 +7343,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7374,8 +7374,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7404,8 +7404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7435,8 +7435,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7467,8 +7467,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7495,8 +7495,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7524,8 +7524,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7552,8 +7552,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7588,8 +7588,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7619,8 +7619,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7649,8 +7649,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7680,8 +7680,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7716,8 +7716,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7748,8 +7748,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7779,8 +7779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7811,8 +7811,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7847,8 +7847,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32@abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -7880,8 +7880,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -7914,8 +7914,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7947,8 +7947,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -7988,8 +7988,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8023,8 +8023,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8055,8 +8055,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8090,8 +8090,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8128,8 +8128,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16i32@abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8163,8 +8163,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8199,8 +8199,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8234,8 +8234,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8279,8 +8279,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32@abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8318,8 +8318,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8358,8 +8358,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8397,8 +8397,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8445,8 +8445,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8487,8 +8487,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8529,8 +8529,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b32 off, v32, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8571,8 +8571,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8614,8 +8614,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8651,8 +8651,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8688,8 +8688,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8725,8 +8725,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8762,8 +8762,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8795,8 +8795,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8829,8 +8829,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8862,8 +8862,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8895,8 +8895,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -8933,8 +8933,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -8965,8 +8965,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -8999,8 +8999,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9031,8 +9031,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9063,8 +9063,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9108,8 +9108,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -9150,8 +9150,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -9192,8 +9192,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s33 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
@@ -9232,8 +9232,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
 ; GFX11-FAKE16-NEXT:    scratch_load_u8 v0, off, s33 offset:8
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
@@ -9274,8 +9274,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_ubyte v0, off, s33 offset:8
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v1, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
@@ -9346,8 +9346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -9397,8 +9397,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9445,8 +9445,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v1, v16
 ; GFX11-NEXT:    v_dual_mov_b32 v2, v17 :: v_dual_mov_b32 v3, v18
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9496,8 +9496,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9525,46 +9525,46 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:16
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:20
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s53, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 13
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s54, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 14
 ; GFX9-NEXT:    s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
-; GFX9-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9586,7 +9586,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:16
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:20
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s33
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
 ; GFX10-NEXT:    s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
@@ -9594,38 +9594,38 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX10-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX10-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX10-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX10-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX10-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX10-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX10-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX10-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX10-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX10-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX10-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX10-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9646,44 +9646,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b64 v[32:33], off, s33 offset:16
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s33
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
 ; GFX11-NEXT:    s_mov_b32 s1, byval_align16_f64_arg@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, byval_align16_f64_arg@abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX11-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX11-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX11-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX11-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX11-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX11-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX11-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX11-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX11-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX11-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX11-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX11-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX11-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX11-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX11-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX11-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX11-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX11-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -9704,44 +9704,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33 offset:16
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v31, off, s33
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, byval_align16_f64_arg@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, byval_align16_f64_arg@abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -9775,8 +9775,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -9804,8 +9804,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -9834,8 +9834,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9863,8 +9863,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -9896,8 +9896,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -9926,8 +9926,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -9957,8 +9957,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -9987,8 +9987,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10021,8 +10021,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10051,8 +10051,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10082,8 +10082,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10112,8 +10112,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10146,8 +10146,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10176,8 +10176,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -10207,8 +10207,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10237,8 +10237,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -10273,8 +10273,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -10306,8 +10306,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -10340,8 +10340,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -10373,8 +10373,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -10412,8 +10412,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10449,8 +10449,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10487,8 +10487,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10524,8 +10524,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10568,8 +10568,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10607,8 +10607,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10647,8 +10647,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10686,8 +10686,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -10731,8 +10731,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
@@ -10774,8 +10774,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
@@ -10818,8 +10818,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
@@ -10861,8 +10861,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
@@ -10915,8 +10915,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -10964,8 +10964,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11014,8 +11014,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11063,8 +11063,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -11106,8 +11106,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11136,8 +11136,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11167,8 +11167,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11197,8 +11197,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11231,8 +11231,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11261,8 +11261,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -11292,8 +11292,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11322,8 +11322,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -11358,8 +11358,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -11391,8 +11391,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -11425,8 +11425,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -11458,8 +11458,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -11497,8 +11497,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11533,8 +11533,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11570,8 +11570,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11606,8 +11606,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
@@ -11650,8 +11650,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
@@ -11692,8 +11692,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
@@ -11735,8 +11735,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
@@ -11777,8 +11777,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
@@ -11817,8 +11817,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -11850,8 +11850,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -11884,8 +11884,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -11917,8 +11917,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -11958,8 +11958,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -11997,8 +11997,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12037,8 +12037,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12076,8 +12076,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -12123,8 +12123,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12168,8 +12168,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12214,8 +12214,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12259,8 +12259,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
@@ -12298,8 +12298,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -12328,8 +12328,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -12359,8 +12359,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -12389,8 +12389,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -12425,8 +12425,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12457,8 +12457,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12490,8 +12490,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12522,8 +12522,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12559,8 +12559,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12591,8 +12591,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12624,8 +12624,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12656,8 +12656,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12694,8 +12694,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12727,8 +12727,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12761,8 +12761,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12794,8 +12794,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12831,8 +12831,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12864,8 +12864,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -12898,8 +12898,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -12931,8 +12931,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -12967,8 +12967,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -12999,8 +12999,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13032,8 +13032,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13064,8 +13064,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13102,8 +13102,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13135,8 +13135,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13169,8 +13169,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13202,8 +13202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13237,8 +13237,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -13267,8 +13267,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -13298,8 +13298,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -13328,8 +13328,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
@@ -13364,8 +13364,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13396,8 +13396,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13429,8 +13429,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13461,8 +13461,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13499,8 +13499,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -13532,8 +13532,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -13566,8 +13566,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -13599,8 +13599,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
@@ -13638,8 +13638,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13674,8 +13674,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13711,8 +13711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13747,8 +13747,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
@@ -13789,8 +13789,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13828,8 +13828,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13868,8 +13868,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13907,8 +13907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13947,8 +13947,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -13983,8 +13983,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14020,8 +14020,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14056,8 +14056,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14100,8 +14100,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14139,8 +14139,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14179,8 +14179,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14218,8 +14218,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -14263,8 +14263,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14305,8 +14305,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14348,8 +14348,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14390,8 +14390,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
@@ -14437,8 +14437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14483,8 +14483,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14530,8 +14530,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14576,8 +14576,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14633,8 +14633,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14684,8 +14684,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14736,8 +14736,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14787,8 +14787,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -14845,8 +14845,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
@@ -14907,8 +14907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
@@ -14970,8 +14970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX11-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX11-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX11-NEXT:    v_readlane_b32 s17, v40, 13
@@ -15032,8 +15032,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
@@ -15134,8 +15134,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15241,8 +15241,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15344,8 +15344,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX11-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX11-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15448,8 +15448,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15565,8 +15565,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15677,8 +15677,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15784,8 +15784,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX11-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX11-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15894,8 +15894,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
@@ -15960,8 +15960,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -15994,8 +15994,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16024,8 +16024,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16054,8 +16054,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16126,8 +16126,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16193,8 +16193,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_12xv3i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16240,8 +16240,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16304,8 +16304,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_12xv3i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16396,8 +16396,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 7
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16471,8 +16471,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_8xv5i32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16522,8 +16522,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16592,8 +16592,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_8xv5i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16680,8 +16680,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16755,8 +16755,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_8xv5f32@abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16811,8 +16811,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16881,8 +16881,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_8xv5f32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -16921,8 +16921,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -16948,8 +16948,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -16976,8 +16976,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17003,8 +17003,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17035,8 +17035,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17062,8 +17062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17090,8 +17090,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17117,8 +17117,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17149,8 +17149,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17176,8 +17176,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17204,8 +17204,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17231,8 +17231,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17263,8 +17263,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17290,8 +17290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17318,8 +17318,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17345,8 +17345,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17377,8 +17377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17404,8 +17404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17432,8 +17432,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17459,8 +17459,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17491,8 +17491,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17518,8 +17518,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17546,8 +17546,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17573,8 +17573,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17605,8 +17605,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17632,8 +17632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17660,8 +17660,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17687,8 +17687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17719,8 +17719,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17746,8 +17746,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17774,8 +17774,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17801,8 +17801,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17833,8 +17833,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17860,8 +17860,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -17888,8 +17888,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17915,8 +17915,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -17947,8 +17947,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -17974,8 +17974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18002,8 +18002,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18029,8 +18029,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18061,8 +18061,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18088,8 +18088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18116,8 +18116,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18143,8 +18143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18175,8 +18175,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18202,8 +18202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18230,8 +18230,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18257,8 +18257,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18289,8 +18289,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18316,8 +18316,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18344,8 +18344,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18371,8 +18371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18403,8 +18403,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -18430,8 +18430,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -18458,8 +18458,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -18485,8 +18485,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 0005e8a2619b2..260398a519660 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -26,8 +26,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
@@ -60,8 +60,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
@@ -95,8 +95,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
@@ -130,8 +130,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; clobber
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX9-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX9-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
@@ -157,8 +157,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; clobber
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX10-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX10-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
@@ -185,8 +185,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX11-NEXT:    ; clobber
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX11-NEXT:    v_readlane_b32 s30, v0, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v0, 3
 ; GFX11-NEXT:    v_readlane_b32 s29, v0, 1
 ; GFX11-NEXT:    v_readlane_b32 s28, v0, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -224,8 +224,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s31
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -261,8 +261,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s31
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -298,8 +298,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s31
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -341,8 +341,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX9-NEXT:    ; use v31
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -378,8 +378,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX10-NEXT:    ; use v31
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -416,8 +416,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX11-NEXT:    ; use v31
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -455,11 +455,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s33
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -492,11 +492,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_mov_b32 s33, s4
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s33
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -529,12 +529,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s33, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s33
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -572,11 +572,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s34, s4
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s34
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -609,11 +609,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_mov_b32 s34, s4
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s34
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -645,13 +645,13 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s34, s4
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s34
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -691,8 +691,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -726,8 +726,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -761,8 +761,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX11-NEXT:    ; use v40
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -849,8 +849,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -876,8 +876,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -934,8 +934,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -961,8 +961,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
@@ -989,8 +989,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -1024,11 +1024,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_mov_b32 s4, s40
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s4
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
@@ -1060,11 +1060,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s4
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
@@ -1096,12 +1096,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s4
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
@@ -1150,8 +1150,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s34, v41, 3
@@ -1195,8 +1195,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v41, 3
@@ -1240,8 +1240,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX11-NEXT:    ; use v40
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX11-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 2
 ; GFX11-NEXT:    v_readlane_b32 s4, v41, 0
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v41, 3
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index c06011c259f9b..0b54bbd7e2105 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -34,8 +34,8 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -59,8 +59,8 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -85,8 +85,8 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -136,8 +136,8 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -161,8 +161,8 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -187,8 +187,8 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -232,8 +232,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -257,8 +257,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -283,8 +283,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v1, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v1, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v1, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -336,8 +336,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -361,8 +361,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -387,8 +387,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -750,8 +750,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v100, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -840,8 +840,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:116
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:120
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:124
-; GFX10-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v100, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -931,8 +931,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:116
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:120
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:124
-; GFX11-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v100, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v100, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload
@@ -2151,8 +2151,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0x60000
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s34
 ; GFX9-NEXT:    s_mov_b32 s34, s38
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[36:37], -1
@@ -2181,8 +2181,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0x30000
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s34
 ; GFX10-NEXT:    s_mov_b32 s34, s38
 ; GFX10-NEXT:    s_xor_saveexec_b32 s36, -1
@@ -2213,8 +2213,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s34
 ; GFX11-NEXT:    s_mov_b32 s34, s36
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
@@ -2889,8 +2889,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s34
 ; GFX9-NEXT:    s_mov_b32 s34, s38
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
@@ -3167,8 +3167,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56
-; GFX10-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s34
 ; GFX10-NEXT:    s_mov_b32 s34, s38
 ; GFX10-NEXT:    s_or_saveexec_b32 s36, -1
@@ -3347,8 +3347,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:44
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:48
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:52
-; GFX11-NEXT:    v_readlane_b32 s31, v62, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v62, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v62, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s34
 ; GFX11-NEXT:    s_mov_b32 s34, s39
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll
index d8df20eb69452..e5d070d8a3455 100644
--- a/llvm/test/CodeGen/AMDGPU/global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll
@@ -4,7 +4,7 @@
 @foo_a = alias void (ptr), ptr @foo
 @bar_a = alias void (ptr), ptr @foo_a
 
-define void @foo() {
+define void @foo() #0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13,7 +13,7 @@ entry:
   ret void
 }
 
-define void @bar() {
+define void @bar() #0 {
 ; CHECK-LABEL: bar:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35,8 +35,8 @@ define void @bar() {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -54,3 +54,5 @@ entry:
 ; CHECK: .set foo_a, foo
 ; CHECK: .set bar_a, foo_a
 ; UTC_ARGS: --enable
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 76f204dd0c16a..e1f6906a89c29 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -9,28 +9,30 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v6, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v6, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v6, s36, 2
-; CHECK-NEXT:    v_writelane_b32 v6, s37, 3
-; CHECK-NEXT:    v_writelane_b32 v6, s38, 4
-; CHECK-NEXT:    v_writelane_b32 v6, s39, 5
-; CHECK-NEXT:    v_writelane_b32 v6, s48, 6
-; CHECK-NEXT:    v_writelane_b32 v6, s49, 7
-; CHECK-NEXT:    v_writelane_b32 v6, s50, 8
-; CHECK-NEXT:    v_writelane_b32 v6, s51, 9
-; CHECK-NEXT:    v_writelane_b32 v6, s52, 10
-; CHECK-NEXT:    v_writelane_b32 v6, s53, 11
-; CHECK-NEXT:    v_writelane_b32 v6, s54, 12
-; CHECK-NEXT:    v_writelane_b32 v6, s55, 13
-; CHECK-NEXT:    v_writelane_b32 v6, s64, 14
-; CHECK-NEXT:    v_writelane_b32 v6, s65, 15
-; CHECK-NEXT:    v_writelane_b32 v6, s66, 16
-; CHECK-NEXT:    v_writelane_b32 v6, s67, 17
-; CHECK-NEXT:    v_writelane_b32 v6, s68, 18
+; CHECK-NEXT:    v_writelane_b32 v6, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v6, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v6, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v6, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v6, s48, 4
+; CHECK-NEXT:    v_writelane_b32 v6, s49, 5
+; CHECK-NEXT:    v_writelane_b32 v6, s50, 6
+; CHECK-NEXT:    v_writelane_b32 v6, s51, 7
+; CHECK-NEXT:    v_writelane_b32 v6, s52, 8
+; CHECK-NEXT:    v_writelane_b32 v6, s53, 9
+; CHECK-NEXT:    v_writelane_b32 v6, s54, 10
+; CHECK-NEXT:    v_writelane_b32 v6, s55, 11
+; CHECK-NEXT:    v_writelane_b32 v6, s64, 12
+; CHECK-NEXT:    v_writelane_b32 v6, s65, 13
+; CHECK-NEXT:    v_writelane_b32 v6, s66, 14
+; CHECK-NEXT:    v_writelane_b32 v6, s67, 15
+; CHECK-NEXT:    v_writelane_b32 v6, s68, 16
+; CHECK-NEXT:    v_writelane_b32 v6, s69, 17
+; CHECK-NEXT:    v_writelane_b32 v6, s70, 18
+; CHECK-NEXT:    v_writelane_b32 v6, s71, 19
+; CHECK-NEXT:    v_writelane_b32 v6, s30, 20
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:    v_writelane_b32 v6, s69, 19
+; CHECK-NEXT:    v_writelane_b32 v6, s31, 21
 ; CHECK-NEXT:    s_mov_b32 s68, 0
 ; CHECK-NEXT:    s_mov_b32 s69, s4
 ; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
@@ -40,11 +42,11 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[68:69], 0x130
 ; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v6, s70, 20
-; CHECK-NEXT:    v_writelane_b32 v6, s71, 21
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s4
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[68:69], 0x2f0
+; CHECK-NEXT:    s_mov_b32 s70, s68
 ; CHECK-NEXT:    v_writelane_b32 v7, s8, 0
 ; CHECK-NEXT:    v_writelane_b32 v7, s9, 1
 ; CHECK-NEXT:    v_writelane_b32 v7, s10, 2
@@ -77,9 +79,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v7, s65, 29
 ; CHECK-NEXT:    v_writelane_b32 v7, s66, 30
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[68:69], 0x1f0
-; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[68:69], 0x2f0
 ; CHECK-NEXT:    s_mov_b32 s69, s68
-; CHECK-NEXT:    s_mov_b32 s70, s68
 ; CHECK-NEXT:    s_mov_b32 s71, s68
 ; CHECK-NEXT:    v_writelane_b32 v7, s67, 31
 ; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
@@ -225,29 +225,29 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    v_readlane_b32 s71, v6, 21
-; CHECK-NEXT:    v_readlane_b32 s70, v6, 20
-; CHECK-NEXT:    v_readlane_b32 s69, v6, 19
-; CHECK-NEXT:    v_readlane_b32 s68, v6, 18
-; CHECK-NEXT:    v_readlane_b32 s67, v6, 17
-; CHECK-NEXT:    v_readlane_b32 s66, v6, 16
-; CHECK-NEXT:    v_readlane_b32 s65, v6, 15
-; CHECK-NEXT:    v_readlane_b32 s64, v6, 14
-; CHECK-NEXT:    v_readlane_b32 s55, v6, 13
-; CHECK-NEXT:    v_readlane_b32 s54, v6, 12
-; CHECK-NEXT:    v_readlane_b32 s53, v6, 11
-; CHECK-NEXT:    v_readlane_b32 s52, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 20
+; CHECK-NEXT:    v_readlane_b32 s31, v6, 21
+; CHECK-NEXT:    v_readlane_b32 s71, v6, 19
+; CHECK-NEXT:    v_readlane_b32 s70, v6, 18
+; CHECK-NEXT:    v_readlane_b32 s69, v6, 17
+; CHECK-NEXT:    v_readlane_b32 s68, v6, 16
+; CHECK-NEXT:    v_readlane_b32 s67, v6, 15
+; CHECK-NEXT:    v_readlane_b32 s66, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s65, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s64, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s55, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s54, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s53, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s52, v6, 8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s51, v6, 9
-; CHECK-NEXT:    v_readlane_b32 s50, v6, 8
-; CHECK-NEXT:    v_readlane_b32 s49, v6, 7
-; CHECK-NEXT:    v_readlane_b32 s48, v6, 6
-; CHECK-NEXT:    v_readlane_b32 s39, v6, 5
-; CHECK-NEXT:    v_readlane_b32 s38, v6, 4
-; CHECK-NEXT:    v_readlane_b32 s37, v6, 3
-; CHECK-NEXT:    v_readlane_b32 s36, v6, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
+; CHECK-NEXT:    v_readlane_b32 s51, v6, 7
+; CHECK-NEXT:    v_readlane_b32 s50, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s49, v6, 5
+; CHECK-NEXT:    v_readlane_b32 s48, v6, 4
+; CHECK-NEXT:    v_readlane_b32 s39, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s38, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s37, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s36, v6, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -336,7 +336,7 @@ declare <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 immarg, float,
 declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
 
-attributes #0 = { "amdgpu-waves-per-eu"="10,10" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
 attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 attributes #2 = { nocallback nofree nosync nounwind willreturn memory(read) }
 attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 5ded0824adc90..ce4230bb9f817 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -5,7 +5,7 @@
 @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 @gv.fptr1 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 
-define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
+define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) #0 {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
@@ -60,7 +60,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
   ret void
 }
 
-define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
+define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) #0 {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr_arg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
@@ -117,7 +117,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
   ret void
 }
 
-define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
+define void @test_indirect_call_vgpr_ptr(ptr %fptr) #0 {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -128,24 +128,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -175,24 +175,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB2_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -212,24 +212,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -259,24 +259,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB2_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -289,7 +289,7 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
   ret void
 }
 
-define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
+define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) #0 {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -300,24 +300,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -350,24 +350,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB3_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -387,24 +387,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -435,24 +435,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB3_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -465,7 +465,7 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
   ret void
 }
 
-define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
+define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) #0 {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_ret:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -476,24 +476,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 18
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s30, 16
+; GCN-NEXT:    v_writelane_b32 v40, s31, 17
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -525,24 +525,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_readlane_b32 s31, v40, 17
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 18
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -562,24 +562,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 18
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 17
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -611,24 +611,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 18
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -642,7 +642,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
   ret i32 %b
 }
 
-define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
+define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) #0 {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_in_branch:
 ; GCN:       ; %bb.0: ; %bb0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -653,26 +653,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
 ; GCN-NEXT:    v_writelane_b32 v40, s16, 20
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
-; GCN-NEXT:    v_writelane_b32 v40, s64, 16
-; GCN-NEXT:    v_writelane_b32 v40, s65, 17
-; GCN-NEXT:    v_writelane_b32 v40, s66, 18
-; GCN-NEXT:    v_writelane_b32 v40, s67, 19
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s64, 14
+; GCN-NEXT:    v_writelane_b32 v40, s65, 15
+; GCN-NEXT:    v_writelane_b32 v40, s66, 16
+; GCN-NEXT:    v_writelane_b32 v40, s67, 17
+; GCN-NEXT:    v_writelane_b32 v40, s30, 18
+; GCN-NEXT:    v_writelane_b32 v40, s31, 19
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -709,26 +709,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 exec, s[64:65]
 ; GCN-NEXT:  .LBB5_4: ; %bb2
 ; GCN-NEXT:    s_or_b64 exec, exec, s[54:55]
-; GCN-NEXT:    v_readlane_b32 s67, v40, 19
-; GCN-NEXT:    v_readlane_b32 s66, v40, 18
-; GCN-NEXT:    v_readlane_b32 s65, v40, 17
-; GCN-NEXT:    v_readlane_b32 s64, v40, 16
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 18
+; GCN-NEXT:    v_readlane_b32 s31, v40, 19
+; GCN-NEXT:    v_readlane_b32 s67, v40, 17
+; GCN-NEXT:    v_readlane_b32 s66, v40, 16
+; GCN-NEXT:    v_readlane_b32 s65, v40, 15
+; GCN-NEXT:    v_readlane_b32 s64, v40, 14
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 20
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -748,26 +748,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v40, s16, 20
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s64, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s65, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s66, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s67, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s64, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s65, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s66, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s67, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 19
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -804,26 +804,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[64:65]
 ; GISEL-NEXT:  .LBB5_4: ; %bb2
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[54:55]
-; GISEL-NEXT:    v_readlane_b32 s67, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s66, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s65, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s64, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s67, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s66, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 20
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -843,7 +843,7 @@ bb2:
   ret void
 }
 
-define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
+define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) #0 {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -853,22 +853,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
@@ -882,22 +882,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB6_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -915,22 +915,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
@@ -946,22 +946,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB6_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -973,7 +973,7 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
   ret void
 }
 
-define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
+define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) #0 {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -984,22 +984,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v41, s30, 0
-; GCN-NEXT:    v_writelane_b32 v41, s31, 1
-; GCN-NEXT:    v_writelane_b32 v41, s34, 2
-; GCN-NEXT:    v_writelane_b32 v41, s35, 3
-; GCN-NEXT:    v_writelane_b32 v41, s36, 4
-; GCN-NEXT:    v_writelane_b32 v41, s37, 5
-; GCN-NEXT:    v_writelane_b32 v41, s38, 6
-; GCN-NEXT:    v_writelane_b32 v41, s39, 7
-; GCN-NEXT:    v_writelane_b32 v41, s48, 8
-; GCN-NEXT:    v_writelane_b32 v41, s49, 9
-; GCN-NEXT:    v_writelane_b32 v41, s50, 10
-; GCN-NEXT:    v_writelane_b32 v41, s51, 11
-; GCN-NEXT:    v_writelane_b32 v41, s52, 12
-; GCN-NEXT:    v_writelane_b32 v41, s53, 13
-; GCN-NEXT:    v_writelane_b32 v41, s54, 14
-; GCN-NEXT:    v_writelane_b32 v41, s55, 15
+; GCN-NEXT:    v_writelane_b32 v41, s34, 0
+; GCN-NEXT:    v_writelane_b32 v41, s35, 1
+; GCN-NEXT:    v_writelane_b32 v41, s36, 2
+; GCN-NEXT:    v_writelane_b32 v41, s37, 3
+; GCN-NEXT:    v_writelane_b32 v41, s38, 4
+; GCN-NEXT:    v_writelane_b32 v41, s39, 5
+; GCN-NEXT:    v_writelane_b32 v41, s48, 6
+; GCN-NEXT:    v_writelane_b32 v41, s49, 7
+; GCN-NEXT:    v_writelane_b32 v41, s50, 8
+; GCN-NEXT:    v_writelane_b32 v41, s51, 9
+; GCN-NEXT:    v_writelane_b32 v41, s52, 10
+; GCN-NEXT:    v_writelane_b32 v41, s53, 11
+; GCN-NEXT:    v_writelane_b32 v41, s54, 12
+; GCN-NEXT:    v_writelane_b32 v41, s55, 13
+; GCN-NEXT:    v_writelane_b32 v41, s30, 14
+; GCN-NEXT:    v_writelane_b32 v41, s31, 15
 ; GCN-NEXT:    v_mov_b32_e32 v40, v0
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1015,22 +1015,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v40
-; GCN-NEXT:    v_readlane_b32 s55, v41, 15
-; GCN-NEXT:    v_readlane_b32 s54, v41, 14
-; GCN-NEXT:    v_readlane_b32 s53, v41, 13
-; GCN-NEXT:    v_readlane_b32 s52, v41, 12
-; GCN-NEXT:    v_readlane_b32 s51, v41, 11
-; GCN-NEXT:    v_readlane_b32 s50, v41, 10
-; GCN-NEXT:    v_readlane_b32 s49, v41, 9
-; GCN-NEXT:    v_readlane_b32 s48, v41, 8
-; GCN-NEXT:    v_readlane_b32 s39, v41, 7
-; GCN-NEXT:    v_readlane_b32 s38, v41, 6
-; GCN-NEXT:    v_readlane_b32 s37, v41, 5
-; GCN-NEXT:    v_readlane_b32 s36, v41, 4
-; GCN-NEXT:    v_readlane_b32 s35, v41, 3
-; GCN-NEXT:    v_readlane_b32 s34, v41, 2
-; GCN-NEXT:    v_readlane_b32 s31, v41, 1
-; GCN-NEXT:    v_readlane_b32 s30, v41, 0
+; GCN-NEXT:    v_readlane_b32 s30, v41, 14
+; GCN-NEXT:    v_readlane_b32 s31, v41, 15
+; GCN-NEXT:    v_readlane_b32 s55, v41, 13
+; GCN-NEXT:    v_readlane_b32 s54, v41, 12
+; GCN-NEXT:    v_readlane_b32 s53, v41, 11
+; GCN-NEXT:    v_readlane_b32 s52, v41, 10
+; GCN-NEXT:    v_readlane_b32 s51, v41, 9
+; GCN-NEXT:    v_readlane_b32 s50, v41, 8
+; GCN-NEXT:    v_readlane_b32 s49, v41, 7
+; GCN-NEXT:    v_readlane_b32 s48, v41, 6
+; GCN-NEXT:    v_readlane_b32 s39, v41, 5
+; GCN-NEXT:    v_readlane_b32 s38, v41, 4
+; GCN-NEXT:    v_readlane_b32 s37, v41, 3
+; GCN-NEXT:    v_readlane_b32 s36, v41, 2
+; GCN-NEXT:    v_readlane_b32 s35, v41, 1
+; GCN-NEXT:    v_readlane_b32 s34, v41, 0
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -1050,22 +1050,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT:    v_writelane_b32 v41, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v41, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v41, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v41, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v41, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v41, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v41, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v41, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v41, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v41, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v41, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v41, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v41, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v41, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v41, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v41, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v41, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v41, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v41, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v41, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v41, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v41, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v41, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v41, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v41, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v41, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v41, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v41, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v41, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v41, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v41, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v41, s31, 15
 ; GISEL-NEXT:    v_mov_b32_e32 v40, v0
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1081,22 +1081,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v40
-; GISEL-NEXT:    v_readlane_b32 s55, v41, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v41, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v41, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v41, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v41, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v41, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v41, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v41, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v41, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v41, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v41, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v41, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v41, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v41, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v41, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v41, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v41, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v41, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v41, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v41, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v41, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v41, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v41, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v41, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v41, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v41, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v41, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v41, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v41, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v41, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v41, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v41, 0
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -1113,7 +1113,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; TODO The argument and return variable could be in the same physical register, but the register
 ; allocator is not able to do that because the return value clashes with the liverange of an
 ; IMPLICIT_DEF of the argument.
-define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
+define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) #0 {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1123,22 +1123,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v1
@@ -1154,22 +1154,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1187,22 +1187,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v1
@@ -1218,22 +1218,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v2
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1246,7 +1246,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 }
 
 ; Calling a vgpr can never be a tail call.
-define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
+define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) #0 {
 ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1256,22 +1256,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s48, 8
-; GCN-NEXT:    v_writelane_b32 v40, s49, 9
-; GCN-NEXT:    v_writelane_b32 v40, s50, 10
-; GCN-NEXT:    v_writelane_b32 v40, s51, 11
-; GCN-NEXT:    v_writelane_b32 v40, s52, 12
-; GCN-NEXT:    v_writelane_b32 v40, s53, 13
-; GCN-NEXT:    v_writelane_b32 v40, s54, 14
-; GCN-NEXT:    v_writelane_b32 v40, s55, 15
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s48, 6
+; GCN-NEXT:    v_writelane_b32 v40, s49, 7
+; GCN-NEXT:    v_writelane_b32 v40, s50, 8
+; GCN-NEXT:    v_writelane_b32 v40, s51, 9
+; GCN-NEXT:    v_writelane_b32 v40, s52, 10
+; GCN-NEXT:    v_writelane_b32 v40, s53, 11
+; GCN-NEXT:    v_writelane_b32 v40, s54, 12
+; GCN-NEXT:    v_writelane_b32 v40, s55, 13
+; GCN-NEXT:    v_writelane_b32 v40, s30, 14
+; GCN-NEXT:    v_writelane_b32 v40, s31, 15
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1284,22 +1284,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB9_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_readlane_b32 s55, v40, 15
-; GCN-NEXT:    v_readlane_b32 s54, v40, 14
-; GCN-NEXT:    v_readlane_b32 s53, v40, 13
-; GCN-NEXT:    v_readlane_b32 s52, v40, 12
-; GCN-NEXT:    v_readlane_b32 s51, v40, 11
-; GCN-NEXT:    v_readlane_b32 s50, v40, 10
-; GCN-NEXT:    v_readlane_b32 s49, v40, 9
-; GCN-NEXT:    v_readlane_b32 s48, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_readlane_b32 s31, v40, 15
+; GCN-NEXT:    v_readlane_b32 s55, v40, 13
+; GCN-NEXT:    v_readlane_b32 s54, v40, 12
+; GCN-NEXT:    v_readlane_b32 s53, v40, 11
+; GCN-NEXT:    v_readlane_b32 s52, v40, 10
+; GCN-NEXT:    v_readlane_b32 s51, v40, 9
+; GCN-NEXT:    v_readlane_b32 s50, v40, 8
+; GCN-NEXT:    v_readlane_b32 s49, v40, 7
+; GCN-NEXT:    v_readlane_b32 s48, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1317,22 +1317,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 15
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1345,22 +1345,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB9_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1372,5 +1372,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
   ret void
 }
 
+attributes #0 = { nounwind }
+
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 7d8a0b70d5f55..0191592c393ce 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -38,8 +38,8 @@ define void @f0() {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v4, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v4, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
@@ -257,4 +257,4 @@ bb43:
   unreachable
 }
 
-attributes #0 = { noinline optnone }
+attributes #0 = { noinline optnone nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index ad2258cf0bbfe..1ece1dc7e6898 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -7,13 +7,13 @@ define fastcc i32 @foo() #0 {
   ; CHECK-LABEL: name: foo
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_WAITCNT .Vmcnt_0_Expcnt_0_Lgkmcnt_0
   ; CHECK-NEXT:   $sgpr16 = S_MOV_B32 $sgpr33
   ; CHECK-NEXT:   $sgpr33 = S_MOV_B32 $sgpr32
   ; CHECK-NEXT:   $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
   ; CHECK-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr17
   ; CHECK-NEXT:   $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
   ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
@@ -26,8 +26,8 @@ define fastcc i32 @foo() #0 {
   ; CHECK-NEXT:   BUFFER_GL1_INV implicit $exec
   ; CHECK-NEXT:   BUFFER_GL0_INV implicit $exec
   ; CHECK-NEXT:   renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40
-  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40
+  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
   ; CHECK-NEXT:   S_WAITCNT .Lgkmcnt_0
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $vcc_lo = S_MOV_B32 $exec_lo
@@ -39,12 +39,12 @@ define fastcc i32 @foo() #0 {
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.DummyReturnBlock:
+  ; CHECK-NEXT:   $sgpr30 = V_READLANE_B32 $vgpr40, 0, implicit-def $sgpr30_sgpr31
   ; CHECK-NEXT:   $sgpr31 = V_READLANE_B32 $vgpr40, 1
-  ; CHECK-NEXT:   $sgpr30 = V_READLANE_B32 $vgpr40, 0
   ; CHECK-NEXT:   $sgpr32 = S_MOV_B32 $sgpr33
   ; CHECK-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr40, 2
   ; CHECK-NEXT:   $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" load (s32) from %stack.2, addrspace 5)
+  ; CHECK-NEXT:   $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" load (s32) from %stack.1, addrspace 5)
   ; CHECK-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr5
   ; CHECK-NEXT:   $sgpr33 = S_MOV_B32 killed $sgpr4
   ; CHECK-NEXT:   S_WAITCNT .Vmcnt_0
diff --git a/llvm/test/CodeGen/AMDGPU/issue176578.ll b/llvm/test/CodeGen/AMDGPU/issue176578.ll
index 08986d1f61efd..22c1307c779ee 100644
--- a/llvm/test/CodeGen/AMDGPU/issue176578.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue176578.ll
@@ -18,22 +18,21 @@ define <4 x i8> @issue176578() #0 {
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v41, s16, 15
-; CHECK-NEXT:    v_writelane_b32 v41, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v41, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v41, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v41, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v41, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v41, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v41, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v41, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v41, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v41, s49, 9
-; CHECK-NEXT:    v_writelane_b32 v41, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v41, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v41, s52, 12
-; CHECK-NEXT:    v_writelane_b32 v41, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v41, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v41, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v41, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v41, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v41, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v41, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v41, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v41, s49, 7
+; CHECK-NEXT:    v_writelane_b32 v41, s50, 8
+; CHECK-NEXT:    v_writelane_b32 v41, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v41, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v41, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v41, s54, 12
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v41, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v41, s30, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
@@ -45,6 +44,7 @@ define <4 x i8> @issue176578() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_mov_b32 s54, 0
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v41, s31, 14
 ; CHECK-NEXT:    s_branch .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
@@ -81,25 +81,25 @@ define <4 x i8> @issue176578() #0 {
 ; CHECK-NEXT:    s_branch .LBB0_1
 ; CHECK-NEXT:  .LBB0_4: ; %bb4
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v41, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s54
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_readlane_b32 s54, v41, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v41, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v41, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v41, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v41, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v41, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v41, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v41, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v41, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v41, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v41, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v41, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v41, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v41, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v41, 14
+; CHECK-NEXT:    v_readlane_b32 s54, v41, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v41, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v41, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v41, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v41, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v41, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v41, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v41, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v41, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v41, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v41, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v41, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v41, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v41, 15
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 97a89ec819bae..3a4684293ccc0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -13,7 +13,7 @@
 ; --------------------------------------------------------------------
 
 ; fp8 x fp8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26,7 +26,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39,7 +39,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +52,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -65,7 +65,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -78,7 +78,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -91,7 +91,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -104,7 +104,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -118,7 +118,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -132,7 +132,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
 }
 
 ; fp8 x bf8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -146,7 +146,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -160,7 +160,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
 }
 
 ; fp8 x fp6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -174,7 +174,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -188,7 +188,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
 }
 
 ; fp8 x bf6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,7 +202,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -216,7 +216,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
 }
 
 ; fp8 x fp4
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -230,7 +230,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -244,7 +244,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
 }
 
 ; bf8 x fp8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -258,7 +258,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -272,7 +272,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
 }
 
 ; bf8 x bf8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -287,7 +287,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
 
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -301,7 +301,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
 }
 
 ; bf8 x fp6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -314,7 +314,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -328,7 +328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
 }
 
 ; bf8 x bf6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -342,7 +342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -356,7 +356,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
 }
 
 ; bf8 x fp4
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -370,7 +370,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -384,7 +384,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
 }
 
 ; fp6 x fp8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -398,7 +398,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -412,7 +412,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
 }
 
 ; fp6 x bf8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -426,7 +426,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -440,7 +440,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
 }
 
 ; fp6 x fp6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -454,7 +454,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -468,7 +468,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons
 }
 
 ; fp6 x bf6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -482,7 +482,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -497,7 +497,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons
 
 
 ; bf6 x fp8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -511,7 +511,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -525,7 +525,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
 }
 
 ; bf6 x bf8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -539,7 +539,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -553,7 +553,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
 }
 
 ; bf6 x fp6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -567,7 +567,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -581,7 +581,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons
 }
 
 ; bf6 x fp4
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -595,7 +595,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -609,7 +609,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons
 }
 
 ; bf6 x bf6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -623,7 +623,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -637,7 +637,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons
 }
 
 ; fp6 x fp4
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -651,7 +651,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -665,7 +665,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons
 }
 
 ; fp4 x fp8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -679,7 +679,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -693,7 +693,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
 }
 
 ; fp4 x bf8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -707,7 +707,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -721,7 +721,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
 }
 
 ; fp4 x fp6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -735,7 +735,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -749,7 +749,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons
 }
 
 ; fp4 x bf6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -763,7 +763,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -777,7 +777,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons
 }
 
 ; fp4 x fp4
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -791,7 +791,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -808,7 +808,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons
 ; Different input parameter classes
 ; --------------------------------------------------------------------
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -821,7 +821,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -833,7 +833,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -845,7 +845,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -890,17 +890,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
 ; GISEL-NEXT:    v_writelane_b32 v24, s30, 0
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_writelane_b32 v24, s31, 1
 ; GISEL-NEXT:    v_readfirstlane_b32 s30, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s31, v1
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[26:27]
 ; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[30:31]
-; GISEL-NEXT:    v_readlane_b32 s31, v24, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v24, 0
+; GISEL-NEXT:    v_readlane_b32 s31, v24, 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, v3 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GISEL-NEXT:    scratch_load_dword v24, off, s32 ; 4-byte Folded Reload
@@ -911,7 +911,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -947,7 +947,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -983,7 +983,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1019,7 +1019,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1045,7 +1045,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1087,7 +1087,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1109,7 +1109,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1131,7 +1131,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1470,7 +1470,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 }
 
 ; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1481,7 +1481,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(
 }
 
 ; This should be optimized to avoid the scale, with non-0 op_sel arguments.
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1491,7 +1491,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1513,7 +1513,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1539,7 +1539,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
 ; Incorrect signature for format cases (IR vector too large)
 ; --------------------------------------------------------------------
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1552,7 +1552,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1565,7 +1565,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1578,7 +1578,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1591,7 +1591,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1604,7 +1604,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1617,7 +1617,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1630,7 +1630,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1643,7 +1643,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1656,7 +1656,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(
   ret <4 x float> %result
 }
 
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) #2 {
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1679,5 +1679,6 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6
 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
 
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" nounwind }
 attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 481eb1bc3d91a..68c0d78485517 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -37,26 +37,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX7-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX7-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX7-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use alloca0 v0
 ; GFX7-NEXT:    ;;#ASMEND
@@ -73,23 +73,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -104,26 +104,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX8-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX8-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX8-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
@@ -141,23 +141,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -172,26 +172,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX900-NEXT:    buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX900-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX900-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX900-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
@@ -208,23 +208,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x101100
 ; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -239,26 +239,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
 ; GFX942-NEXT:    scratch_store_dword off, v23, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX942-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX942-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
@@ -273,23 +273,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4044
 ; GFX942-NEXT:    scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
@@ -305,29 +305,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_1-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use alloca0 v0
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -338,23 +338,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_1-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -370,29 +370,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_3-NEXT:    buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use alloca0 v0
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -403,23 +403,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80880
 ; GFX10_3-NEXT:    buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -434,30 +434,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
 ; GFX11-NEXT:    scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v23, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v23, s34, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use alloca0 v0
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -470,23 +470,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4044
 ; GFX11-NEXT:    scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
@@ -505,28 +505,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX12-NEXT:    scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v23, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v23, s33, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use alloca0 v0
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v23, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v23, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v23, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v23, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v23, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v23, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -540,23 +540,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -613,24 +613,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX7-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX7-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX7-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX7-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX7-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX7-NEXT:    ;;#ASMEND
@@ -640,23 +640,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -671,24 +671,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX8-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX8-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX8-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX8-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX8-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX8-NEXT:    ;;#ASMEND
@@ -699,23 +699,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -730,24 +730,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX900-NEXT:    buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX900-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX900-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX900-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX900-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX900-NEXT:    ;;#ASMEND
@@ -758,23 +758,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x100400
 ; GFX900-NEXT:    buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -789,24 +789,25 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4010
 ; GFX942-NEXT:    scratch_store_dword off, v21, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v21, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX942-NEXT:    v_writelane_b32 v21, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v21, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX942-NEXT:    v_writelane_b32 v21, s30, 15
 ; GFX942-NEXT:    s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX942-NEXT:    ;;#ASMEND
@@ -818,23 +819,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x4010
 ; GFX942-NEXT:    scratch_load_dword v21, off, s2 ; 4-byte Folded Reload
@@ -850,24 +851,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_1-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX10_1-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX10_1-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX10_1-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX10_1-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -878,23 +879,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_1-NEXT:    buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -910,24 +911,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_3-NEXT:    buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX10_3-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX10_3-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX10_3-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX10_3-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -938,23 +939,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x80200
 ; GFX10_3-NEXT:    buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -969,24 +970,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4010
 ; GFX11-NEXT:    scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX11-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX11-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX11-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -999,23 +1000,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4010
 ; GFX11-NEXT:    scratch_load_b32 v21, off, s1 ; 4-byte Folded Reload
@@ -1034,24 +1035,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX12-NEXT:    scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v21, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v21, s33, 0
 ; GFX12-NEXT:    s_and_b32 s59, 0, exec_lo
-; GFX12-NEXT:    v_writelane_b32 v21, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v21, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v21, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v21, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v21, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v21, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v21, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v21, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v21, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v21, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v21, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v21, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v21, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v21, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v21, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v21, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v21, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v21, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v21, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v21, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v21, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v21, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v21, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v21, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v21, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v21, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v21, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v21, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v21, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v21, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v21, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v21, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -1061,23 +1062,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readlane_b32 s55, v21, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v21, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v21, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v21, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v21, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v21, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v21, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v21, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v21, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v21, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v21, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v21, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v21, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v21, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v21, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v21, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v21, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v21, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v21, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v21, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v21, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v21, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v21, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v21, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v21, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v21, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v21, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v21, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v21, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v21, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v21, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v21, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v21, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v21, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1135,30 +1136,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    v_writelane_b32 v23, s28, 17
 ; GFX7-NEXT:    v_writelane_b32 v23, s29, 18
-; GFX7-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX7-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX7-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX7-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX7-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX7-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX7-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX7-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX7-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX7-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX7-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX7-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX7-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX7-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX7-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX7-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX7-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX7-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX7-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX7-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX7-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX7-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX7-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX7-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX7-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX7-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX7-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX7-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX7-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
 ; GFX7-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX7-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX7-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GFX7-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX7-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX7-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use alloca0 v0
 ; GFX7-NEXT:    ;;#ASMEND
@@ -1169,23 +1170,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    ;;#ASMSTART
 ; GFX7-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX7-NEXT:    ;;#ASMEND
-; GFX7-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX7-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX7-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX7-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX7-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX7-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX7-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX7-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX7-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX7-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX7-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX7-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX7-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX7-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX7-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX7-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX7-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX7-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX7-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX7-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX7-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX7-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX7-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX7-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX7-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX7-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX7-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX7-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX7-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX7-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX7-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX7-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX7-NEXT:    v_readlane_b32 s28, v23, 17
 ; GFX7-NEXT:    v_readlane_b32 s29, v23, 18
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
@@ -1206,30 +1207,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x201100
 ; GFX8-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX8-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX8-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX8-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX8-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX8-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX8-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX8-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX8-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX8-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX8-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX8-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX8-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX8-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX8-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX8-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX8-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX8-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX8-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX8-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX8-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX8-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX8-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX8-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX8-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX8-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX8-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX8-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX8-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX8-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX8-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
 ; GFX8-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX8-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX8-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use alloca0 v0
 ; GFX8-NEXT:    ;;#ASMEND
@@ -1241,23 +1242,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX8-NEXT:    ;;#ASMEND
-; GFX8-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX8-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX8-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX8-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX8-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX8-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX8-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX8-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX8-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX8-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX8-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX8-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX8-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX8-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX8-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX8-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX8-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX8-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX8-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX8-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX8-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX8-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX8-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX8-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX8-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX8-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX8-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX8-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX8-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX8-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX8-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX8-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
 ; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1276,30 +1277,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x201100
 ; GFX900-NEXT:    buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    v_writelane_b32 v23, s30, 0
-; GFX900-NEXT:    v_writelane_b32 v23, s31, 1
-; GFX900-NEXT:    v_writelane_b32 v23, s33, 2
-; GFX900-NEXT:    v_writelane_b32 v23, s34, 3
-; GFX900-NEXT:    v_writelane_b32 v23, s35, 4
-; GFX900-NEXT:    v_writelane_b32 v23, s36, 5
-; GFX900-NEXT:    v_writelane_b32 v23, s37, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s38, 7
-; GFX900-NEXT:    v_writelane_b32 v23, s39, 8
-; GFX900-NEXT:    v_writelane_b32 v23, s48, 9
-; GFX900-NEXT:    v_writelane_b32 v23, s49, 10
-; GFX900-NEXT:    v_writelane_b32 v23, s50, 11
-; GFX900-NEXT:    v_writelane_b32 v23, s51, 12
-; GFX900-NEXT:    v_writelane_b32 v23, s52, 13
+; GFX900-NEXT:    v_writelane_b32 v23, s33, 0
+; GFX900-NEXT:    v_writelane_b32 v23, s34, 1
+; GFX900-NEXT:    v_writelane_b32 v23, s35, 2
+; GFX900-NEXT:    v_writelane_b32 v23, s36, 3
+; GFX900-NEXT:    v_writelane_b32 v23, s37, 4
+; GFX900-NEXT:    v_writelane_b32 v23, s38, 5
+; GFX900-NEXT:    v_writelane_b32 v23, s39, 6
+; GFX900-NEXT:    v_writelane_b32 v23, s48, 7
+; GFX900-NEXT:    v_writelane_b32 v23, s49, 8
+; GFX900-NEXT:    v_writelane_b32 v23, s50, 9
+; GFX900-NEXT:    v_writelane_b32 v23, s51, 10
+; GFX900-NEXT:    v_writelane_b32 v23, s52, 11
+; GFX900-NEXT:    v_writelane_b32 v23, s53, 12
+; GFX900-NEXT:    v_writelane_b32 v23, s54, 13
 ; GFX900-NEXT:    s_lshr_b32 s5, s32, 6
-; GFX900-NEXT:    v_writelane_b32 v23, s53, 14
+; GFX900-NEXT:    v_writelane_b32 v23, s55, 14
 ; GFX900-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; GFX900-NEXT:    s_add_i32 s4, s5, 0x4240
 ; GFX900-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX900-NEXT:    v_writelane_b32 v23, s54, 15
+; GFX900-NEXT:    v_writelane_b32 v23, s30, 15
 ; GFX900-NEXT:    v_add_u32_e32 v0, 64, v0
 ; GFX900-NEXT:    v_writelane_b32 v22, s4, 0
 ; GFX900-NEXT:    s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT:    v_writelane_b32 v23, s55, 16
+; GFX900-NEXT:    v_writelane_b32 v23, s31, 16
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use alloca0 v0
 ; GFX900-NEXT:    ;;#ASMEND
@@ -1311,23 +1312,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_readlane_b32 s55, v23, 16
-; GFX900-NEXT:    v_readlane_b32 s54, v23, 15
-; GFX900-NEXT:    v_readlane_b32 s53, v23, 14
-; GFX900-NEXT:    v_readlane_b32 s52, v23, 13
-; GFX900-NEXT:    v_readlane_b32 s51, v23, 12
-; GFX900-NEXT:    v_readlane_b32 s50, v23, 11
-; GFX900-NEXT:    v_readlane_b32 s49, v23, 10
-; GFX900-NEXT:    v_readlane_b32 s48, v23, 9
-; GFX900-NEXT:    v_readlane_b32 s39, v23, 8
-; GFX900-NEXT:    v_readlane_b32 s38, v23, 7
-; GFX900-NEXT:    v_readlane_b32 s37, v23, 6
-; GFX900-NEXT:    v_readlane_b32 s36, v23, 5
-; GFX900-NEXT:    v_readlane_b32 s35, v23, 4
-; GFX900-NEXT:    v_readlane_b32 s34, v23, 3
-; GFX900-NEXT:    v_readlane_b32 s33, v23, 2
-; GFX900-NEXT:    v_readlane_b32 s31, v23, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v23, 0
+; GFX900-NEXT:    v_readlane_b32 s30, v23, 15
+; GFX900-NEXT:    v_readlane_b32 s31, v23, 16
+; GFX900-NEXT:    v_readlane_b32 s55, v23, 14
+; GFX900-NEXT:    v_readlane_b32 s54, v23, 13
+; GFX900-NEXT:    v_readlane_b32 s53, v23, 12
+; GFX900-NEXT:    v_readlane_b32 s52, v23, 11
+; GFX900-NEXT:    v_readlane_b32 s51, v23, 10
+; GFX900-NEXT:    v_readlane_b32 s50, v23, 9
+; GFX900-NEXT:    v_readlane_b32 s49, v23, 8
+; GFX900-NEXT:    v_readlane_b32 s48, v23, 7
+; GFX900-NEXT:    v_readlane_b32 s39, v23, 6
+; GFX900-NEXT:    v_readlane_b32 s38, v23, 5
+; GFX900-NEXT:    v_readlane_b32 s37, v23, 4
+; GFX900-NEXT:    v_readlane_b32 s36, v23, 3
+; GFX900-NEXT:    v_readlane_b32 s35, v23, 2
+; GFX900-NEXT:    v_readlane_b32 s34, v23, 1
+; GFX900-NEXT:    v_readlane_b32 s33, v23, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    s_add_i32 s6, s32, 0x201000
 ; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1344,28 +1345,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
 ; GFX942-NEXT:    scratch_store_dword off, v22, s2 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v22, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v22, s31, 1
-; GFX942-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX942-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX942-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX942-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX942-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX942-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX942-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX942-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX942-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX942-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX942-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX942-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX942-NEXT:    v_writelane_b32 v22, s53, 14
+; GFX942-NEXT:    v_writelane_b32 v22, s33, 0
+; GFX942-NEXT:    v_writelane_b32 v22, s34, 1
+; GFX942-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX942-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX942-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX942-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX942-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX942-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX942-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX942-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX942-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX942-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX942-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX942-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX942-NEXT:    v_writelane_b32 v22, s55, 14
 ; GFX942-NEXT:    s_add_i32 s0, s32, 64
-; GFX942-NEXT:    v_writelane_b32 v22, s54, 15
+; GFX942-NEXT:    v_writelane_b32 v22, s30, 15
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s0
-; GFX942-NEXT:    v_writelane_b32 v22, s55, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use alloca0 v0
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1376,23 +1378,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX942-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX942-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX942-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX942-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX942-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX942-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX942-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX942-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX942-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX942-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX942-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX942-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX942-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX942-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX942-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX942-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX942-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX942-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX942-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX942-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX942-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX942-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX942-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX942-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX942-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX942-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX942-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX942-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX942-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX942-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX942-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    s_add_i32 s2, s32, 0x8040
 ; GFX942-NEXT:    scratch_load_dword v22, off, s2 ; 4-byte Folded Reload
@@ -1408,31 +1410,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_1-NEXT:    buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_1-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
 ; GFX10_1-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX10_1-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX10_1-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_1-NEXT:    s_lshr_b32 s4, s32, 5
 ; GFX10_1-NEXT:    s_add_i32 s58, s4, 0x4240
-; GFX10_1-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX10_1-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX10_1-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_1-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use alloca0 v0
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX10_1-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX10_1-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX10_1-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX10_1-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX10_1-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX10_1-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX10_1-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX10_1-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX10_1-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX10_1-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX10_1-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX10_1-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX10_1-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX10_1-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX10_1-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX10_1-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX10_1-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX10_1-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX10_1-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX10_1-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX10_1-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX10_1-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX10_1-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX10_1-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX10_1-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX10_1-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX10_1-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX10_1-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX10_1-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX10_1-NEXT:    ;;#ASMEND
@@ -1441,23 +1443,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_1-NEXT:    ;;#ASMSTART
 ; GFX10_1-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX10_1-NEXT:    ;;#ASMEND
-; GFX10_1-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX10_1-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX10_1-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX10_1-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX10_1-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX10_1-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX10_1-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX10_1-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX10_1-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX10_1-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX10_1-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX10_1-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX10_1-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX10_1-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX10_1-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX10_1-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX10_1-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX10_1-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX10_1-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX10_1-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX10_1-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX10_1-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX10_1-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX10_1-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX10_1-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX10_1-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX10_1-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX10_1-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX10_1-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX10_1-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX10_1-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX10_1-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX10_1-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX10_1-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX10_1-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_1-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_1-NEXT:    buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1473,31 +1475,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_3-NEXT:    buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GFX10_3-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX10_3-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX10_3-NEXT:    v_lshrrev_b32_e64 v0, 5, s32
 ; GFX10_3-NEXT:    s_lshr_b32 s4, s32, 5
 ; GFX10_3-NEXT:    s_add_i32 s58, s4, 0x4240
-; GFX10_3-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX10_3-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX10_3-NEXT:    v_add_nc_u32_e32 v0, 64, v0
 ; GFX10_3-NEXT:    s_and_b32 s4, 0, exec_lo
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use alloca0 v0
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX10_3-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX10_3-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX10_3-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX10_3-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX10_3-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX10_3-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX10_3-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX10_3-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX10_3-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX10_3-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX10_3-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX10_3-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX10_3-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX10_3-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX10_3-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX10_3-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX10_3-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX10_3-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX10_3-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX10_3-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX10_3-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX10_3-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX10_3-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX10_3-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX10_3-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX10_3-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX10_3-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX10_3-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX10_3-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX10_3-NEXT:    ;;#ASMEND
@@ -1506,23 +1508,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX10_3-NEXT:    ;;#ASMSTART
 ; GFX10_3-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX10_3-NEXT:    ;;#ASMEND
-; GFX10_3-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX10_3-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX10_3-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX10_3-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX10_3-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX10_3-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX10_3-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX10_3-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX10_3-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX10_3-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX10_3-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX10_3-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX10_3-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX10_3-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX10_3-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX10_3-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX10_3-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX10_3-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX10_3-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX10_3-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX10_3-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX10_3-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX10_3-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX10_3-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX10_3-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX10_3-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX10_3-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX10_3-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX10_3-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX10_3-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX10_3-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX10_3-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX10_3-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX10_3-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX10_3-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10_3-NEXT:    s_add_i32 s5, s32, 0x100800
 ; GFX10_3-NEXT:    buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1537,30 +1539,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
 ; GFX11-NEXT:    scratch_store_b32 off, v22, s1 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
 ; GFX11-NEXT:    s_add_i32 s58, s32, 0x4240
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT:    v_writelane_b32 v22, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v22, s34, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use alloca0 v0
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX11-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX11-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX11-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX11-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX11-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX11-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX11-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX11-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX11-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX11-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX11-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX11-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX11-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX11-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX11-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX11-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX11-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX11-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX11-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX11-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX11-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX11-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX11-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX11-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX11-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX11-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX11-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX11-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX11-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX11-NEXT:    ;;#ASMEND
@@ -1570,23 +1572,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX11-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX11-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX11-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX11-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX11-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX11-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX11-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX11-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX11-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX11-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX11-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX11-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX11-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX11-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX11-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX11-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX11-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX11-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX11-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX11-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX11-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX11-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX11-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX11-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX11-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX11-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX11-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX11-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX11-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX11-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX11-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX11-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x8040
 ; GFX11-NEXT:    scratch_load_b32 v22, off, s1 ; 4-byte Folded Reload
@@ -1605,29 +1607,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX12-NEXT:    scratch_store_b32 off, v22, s32 offset:32768 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v22, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v22, s33, 0
 ; GFX12-NEXT:    s_add_co_i32 s58, s32, 0x4200
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s32
 ; GFX12-NEXT:    s_and_b32 s0, 0, exec_lo
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use alloca0 v0
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_writelane_b32 v22, s31, 1
-; GFX12-NEXT:    v_writelane_b32 v22, s33, 2
-; GFX12-NEXT:    v_writelane_b32 v22, s34, 3
-; GFX12-NEXT:    v_writelane_b32 v22, s35, 4
-; GFX12-NEXT:    v_writelane_b32 v22, s36, 5
-; GFX12-NEXT:    v_writelane_b32 v22, s37, 6
-; GFX12-NEXT:    v_writelane_b32 v22, s38, 7
-; GFX12-NEXT:    v_writelane_b32 v22, s39, 8
-; GFX12-NEXT:    v_writelane_b32 v22, s48, 9
-; GFX12-NEXT:    v_writelane_b32 v22, s49, 10
-; GFX12-NEXT:    v_writelane_b32 v22, s50, 11
-; GFX12-NEXT:    v_writelane_b32 v22, s51, 12
-; GFX12-NEXT:    v_writelane_b32 v22, s52, 13
-; GFX12-NEXT:    v_writelane_b32 v22, s53, 14
-; GFX12-NEXT:    v_writelane_b32 v22, s54, 15
-; GFX12-NEXT:    v_writelane_b32 v22, s55, 16
+; GFX12-NEXT:    v_writelane_b32 v22, s34, 1
+; GFX12-NEXT:    v_writelane_b32 v22, s35, 2
+; GFX12-NEXT:    v_writelane_b32 v22, s36, 3
+; GFX12-NEXT:    v_writelane_b32 v22, s37, 4
+; GFX12-NEXT:    v_writelane_b32 v22, s38, 5
+; GFX12-NEXT:    v_writelane_b32 v22, s39, 6
+; GFX12-NEXT:    v_writelane_b32 v22, s48, 7
+; GFX12-NEXT:    v_writelane_b32 v22, s49, 8
+; GFX12-NEXT:    v_writelane_b32 v22, s50, 9
+; GFX12-NEXT:    v_writelane_b32 v22, s51, 10
+; GFX12-NEXT:    v_writelane_b32 v22, s52, 11
+; GFX12-NEXT:    v_writelane_b32 v22, s53, 12
+; GFX12-NEXT:    v_writelane_b32 v22, s54, 13
+; GFX12-NEXT:    v_writelane_b32 v22, s55, 14
+; GFX12-NEXT:    v_writelane_b32 v22, s30, 15
+; GFX12-NEXT:    v_writelane_b32 v22, s31, 16
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
 ; GFX12-NEXT:    ;;#ASMEND
@@ -1637,23 +1639,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_readlane_b32 s55, v22, 16
-; GFX12-NEXT:    v_readlane_b32 s54, v22, 15
-; GFX12-NEXT:    v_readlane_b32 s53, v22, 14
-; GFX12-NEXT:    v_readlane_b32 s52, v22, 13
-; GFX12-NEXT:    v_readlane_b32 s51, v22, 12
-; GFX12-NEXT:    v_readlane_b32 s50, v22, 11
-; GFX12-NEXT:    v_readlane_b32 s49, v22, 10
-; GFX12-NEXT:    v_readlane_b32 s48, v22, 9
-; GFX12-NEXT:    v_readlane_b32 s39, v22, 8
-; GFX12-NEXT:    v_readlane_b32 s38, v22, 7
-; GFX12-NEXT:    v_readlane_b32 s37, v22, 6
-; GFX12-NEXT:    v_readlane_b32 s36, v22, 5
-; GFX12-NEXT:    v_readlane_b32 s35, v22, 4
-; GFX12-NEXT:    v_readlane_b32 s34, v22, 3
-; GFX12-NEXT:    v_readlane_b32 s33, v22, 2
-; GFX12-NEXT:    v_readlane_b32 s31, v22, 1
-; GFX12-NEXT:    v_readlane_b32 s30, v22, 0
+; GFX12-NEXT:    v_readlane_b32 s30, v22, 15
+; GFX12-NEXT:    v_readlane_b32 s31, v22, 16
+; GFX12-NEXT:    v_readlane_b32 s55, v22, 14
+; GFX12-NEXT:    v_readlane_b32 s54, v22, 13
+; GFX12-NEXT:    v_readlane_b32 s53, v22, 12
+; GFX12-NEXT:    v_readlane_b32 s52, v22, 11
+; GFX12-NEXT:    v_readlane_b32 s51, v22, 10
+; GFX12-NEXT:    v_readlane_b32 s50, v22, 9
+; GFX12-NEXT:    v_readlane_b32 s49, v22, 8
+; GFX12-NEXT:    v_readlane_b32 s48, v22, 7
+; GFX12-NEXT:    v_readlane_b32 s39, v22, 6
+; GFX12-NEXT:    v_readlane_b32 s38, v22, 5
+; GFX12-NEXT:    v_readlane_b32 s37, v22, 4
+; GFX12-NEXT:    v_readlane_b32 s36, v22, 3
+; GFX12-NEXT:    v_readlane_b32 s35, v22, 2
+; GFX12-NEXT:    v_readlane_b32 s34, v22, 1
+; GFX12-NEXT:    v_readlane_b32 s33, v22, 0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v22, off, s32 offset:32768 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 9abd8110e5529..87a5012dda7e2 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -194,22 +194,22 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    v_writelane_b32 v43, s4, 5
-; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v43, s36, 3
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s36, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, foo at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, foo at gotpcrel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v43, s37, 4
+; GFX9-NEXT:    v_writelane_b32 v43, s37, 2
 ; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-NEXT:    v_writelane_b32 v43, s30, 3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v41, v40
+; GFX9-NEXT:    v_writelane_b32 v43, s31, 4
 ; GFX9-NEXT:    s_mov_b32 s34, s15
 ; GFX9-NEXT:    v_and_b32_e32 v42, 0xffffff, v40
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -224,11 +224,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s37, v43, 4
-; GFX9-NEXT:    v_readlane_b32 s36, v43, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v43, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v43, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v43, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v43, 2
+; GFX9-NEXT:    v_readlane_b32 s36, v43, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v43, 5
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 65446a036c91b..ccc1ce5316ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -47,8 +47,8 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; clobber csr v40
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
@@ -190,8 +190,8 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -224,8 +224,8 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -271,6 +271,6 @@ entry:
   ret void
 }
 
-attributes #0 = { "frame-pointer"="none" noinline }
-attributes #1 = { "frame-pointer"="all" noinline }
+attributes #0 = { "frame-pointer"="none" noinline nounwind }
+attributes #1 = { "frame-pointer"="all" noinline nounwind }
 
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index ccaf0ac5377e4..8394b325bee6d 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -29,8 +29,8 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -68,8 +68,8 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 6b6c60ebe2a9e..0da206e2485c4 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -247,8 +247,8 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:    .loc 0 32 1 ; lane-info.cpp:32:1
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -267,7 +267,7 @@ entry:
 ; Function Attrs: convergent nounwind
 declare void @_ZL13sleep_foreverv() #0
 
-attributes #0 = { "frame-pointer"="all" }
+attributes #0 = { "frame-pointer"="all" nounwind }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!1638, !1639, !1640, !1641}
diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
index 1ee886541083e..cf9cfc47f10f8 100644
--- a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
@@ -35,9 +35,9 @@ define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) #0 {
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s30, v4, 0
 ; CHECK-NEXT:    v_min_f32_e32 v0, v3, v0
 ; CHECK-NEXT:    v_readlane_b32 s31, v4, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v4, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -85,10 +85,10 @@ define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) #0 {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v2
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    v_min_f32_e32 v0, v4, v0
 ; CHECK-NEXT:    v_min_f32_e32 v1, v5, v1
 ; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -136,9 +136,9 @@ define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) #0 {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v4
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -209,8 +209,8 @@ define nofpclass(nan inf) { double, double } @aggregate() #0 {
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -247,10 +247,10 @@ define { float, float } @aggregate_use(float %z) #0 {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_max_f32_e32 v2, v40, v40
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
 ; CHECK-NEXT:    v_min_f32_e32 v0, v0, v2
 ; CHECK-NEXT:    v_min_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_readlane_b32 s31, v41, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v41, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -317,13 +317,13 @@ define <5 x double> @call_nofpclass_funcs_v5f64_non_mvt_vector(ptr addrspace(1)
 ; CHECK-NEXT:    v_mov_b32_e32 v20, v8
 ; CHECK-NEXT:    v_mov_b32_e32 v21, v9
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s30, v24, 0
 ; CHECK-NEXT:    v_min_f64 v[0:1], v[12:13], v[0:1]
 ; CHECK-NEXT:    v_min_f64 v[2:3], v[14:15], v[2:3]
 ; CHECK-NEXT:    v_min_f64 v[4:5], v[16:17], v[4:5]
 ; CHECK-NEXT:    v_min_f64 v[6:7], v[18:19], v[6:7]
 ; CHECK-NEXT:    v_min_f64 v[8:9], v[20:21], v[8:9]
 ; CHECK-NEXT:    v_readlane_b32 s31, v24, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v24, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v24, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 1521ad5219174..6fefed6e07f2d 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -338,8 +338,8 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX906-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX906-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX906-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX906-NEXT:    s_mov_b32 s32, s33
 ; GFX906-NEXT:    v_readlane_b32 s4, v41, 4
 ; GFX906-NEXT:    v_readlane_b32 s34, v41, 2
@@ -398,21 +398,14 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    s_addk_i32 s32, 0x2c00
 ; GFX908-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX908-NEXT:    s_mov_b64 s[16:17], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
+; GFX908-NEXT:    s_mov_b64 exec, 3
 ; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX908-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[16:17]
-; GFX908-NEXT:    s_mov_b64 s[16:17], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:168
-; GFX908-NEXT:    v_writelane_b32 v2, s31, 0
-; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:168
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, s[16:17]
 ; GFX908-NEXT:    s_mov_b32 s21, s15
 ; GFX908-NEXT:    ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
 ; GFX908-NEXT:    s_mov_b32 s22, s14
@@ -755,20 +748,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, 1
-; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:168
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readlane_b32 s31, v0, 0
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:168
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX908-NEXT:    s_mov_b64 s[4:5], exec
-; GFX908-NEXT:    s_mov_b64 exec, 1
+; GFX908-NEXT:    s_mov_b64 exec, 3
 ; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX908-NEXT:    v_readlane_b32 s31, v0, 1
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 9d8a54b4cc178..f02b895cc6e7d 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 < %s | FileCheck %s -check-prefix=GFX11
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 < %s | FileCheck %s -check-prefix=GFX12
 
-define void @test_remat_s_getpc_b64() {
+define void @test_remat_s_getpc_b64() #0 {
 ; GFX9-LABEL: test_remat_s_getpc_b64:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20,9 +20,9 @@ define void @test_remat_s_getpc_b64() {
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -45,8 +45,8 @@ define void @test_remat_s_getpc_b64() {
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
@@ -79,8 +79,8 @@ define void @test_remat_s_getpc_b64() {
 ; GFX12-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX12-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX12-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX12-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
@@ -97,3 +97,5 @@ entry:
 }
 
 declare i64 @llvm.amdgcn.s.getpc()
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 9005e26d24abb..7fb73949fce57 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -28,16 +28,16 @@ body:             |
   ; GCN-LABEL: name: test_main
   ; GCN: bb.0:
   ; GCN-NEXT:   successors: %bb.1(0x80000000)
-  ; GCN-NEXT:   liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0
+  ; GCN-NEXT:   liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   $vcc_hi = frame-setup COPY $sgpr33
   ; GCN-NEXT:   $sgpr33 = frame-setup COPY $sgpr32
   ; GCN-NEXT:   $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
   ; GCN-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr0
   ; GCN-NEXT:   $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr2
@@ -66,48 +66,48 @@ body:             |
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr2
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr2
   ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr2
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr3
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr4
-  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr4
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 26, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 27, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 28, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 29, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 30, $vgpr2
+  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 31, $vgpr2
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 0, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 1, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 2, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 3, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 4, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 5, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 6, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 7, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 8, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 9, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 10, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 11, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 12, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 13, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 14, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 15, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 16, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 17, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 18, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 19, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 20, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 21, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 22, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 23, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 24, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 25, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 26, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 27, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 28, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 29, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 30, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 31, $vgpr3
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 0, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 1, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr30, 2, $vgpr4, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr31, 3, $vgpr4, implicit $sgpr30_sgpr31
   ; GCN-NEXT:   $sgpr22 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr5 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5
@@ -130,48 +130,48 @@ body:             |
   ; GCN-NEXT: bb.3:
   ; GCN-NEXT:   liveins: $vcc_hi
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
-  ; GCN-NEXT:   $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2
-  ; GCN-NEXT:   $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
-  ; GCN-NEXT:   $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
-  ; GCN-NEXT:   $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
-  ; GCN-NEXT:   $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
-  ; GCN-NEXT:   $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
-  ; GCN-NEXT:   $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
-  ; GCN-NEXT:   $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
-  ; GCN-NEXT:   $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
-  ; GCN-NEXT:   $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
-  ; GCN-NEXT:   $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
-  ; GCN-NEXT:   $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
-  ; GCN-NEXT:   $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
-  ; GCN-NEXT:   $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
-  ; GCN-NEXT:   $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
-  ; GCN-NEXT:   $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
-  ; GCN-NEXT:   $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
-  ; GCN-NEXT:   $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
-  ; GCN-NEXT:   $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
-  ; GCN-NEXT:   $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
-  ; GCN-NEXT:   $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
-  ; GCN-NEXT:   $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
-  ; GCN-NEXT:   $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
-  ; GCN-NEXT:   $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
-  ; GCN-NEXT:   $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
-  ; GCN-NEXT:   $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
-  ; GCN-NEXT:   $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
-  ; GCN-NEXT:   $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
-  ; GCN-NEXT:   $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
-  ; GCN-NEXT:   $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
-  ; GCN-NEXT:   $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
-  ; GCN-NEXT:   $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
-  ; GCN-NEXT:   $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
-  ; GCN-NEXT:   $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
-  ; GCN-NEXT:   $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
-  ; GCN-NEXT:   $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
-  ; GCN-NEXT:   $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
-  ; GCN-NEXT:   $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
-  ; GCN-NEXT:   $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
-  ; GCN-NEXT:   $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
-  ; GCN-NEXT:   $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
+  ; GCN-NEXT:   $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2, implicit-def $sgpr30_sgpr31
+  ; GCN-NEXT:   $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
+  ; GCN-NEXT:   $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
+  ; GCN-NEXT:   $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
+  ; GCN-NEXT:   $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
+  ; GCN-NEXT:   $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
+  ; GCN-NEXT:   $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
+  ; GCN-NEXT:   $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
+  ; GCN-NEXT:   $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
+  ; GCN-NEXT:   $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
+  ; GCN-NEXT:   $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
+  ; GCN-NEXT:   $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
+  ; GCN-NEXT:   $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
+  ; GCN-NEXT:   $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
+  ; GCN-NEXT:   $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
+  ; GCN-NEXT:   $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
+  ; GCN-NEXT:   $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
+  ; GCN-NEXT:   $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
+  ; GCN-NEXT:   $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
+  ; GCN-NEXT:   $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
+  ; GCN-NEXT:   $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
+  ; GCN-NEXT:   $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
+  ; GCN-NEXT:   $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
+  ; GCN-NEXT:   $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
+  ; GCN-NEXT:   $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
+  ; GCN-NEXT:   $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
+  ; GCN-NEXT:   $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
+  ; GCN-NEXT:   $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
+  ; GCN-NEXT:   $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
+  ; GCN-NEXT:   $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
+  ; GCN-NEXT:   $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
+  ; GCN-NEXT:   $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
+  ; GCN-NEXT:   $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
+  ; GCN-NEXT:   $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
+  ; GCN-NEXT:   $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
+  ; GCN-NEXT:   $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
+  ; GCN-NEXT:   $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
+  ; GCN-NEXT:   $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
+  ; GCN-NEXT:   $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
+  ; GCN-NEXT:   $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
+  ; GCN-NEXT:   $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
+  ; GCN-NEXT:   $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
   ; GCN-NEXT:   $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25
   ; GCN-NEXT:   $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24
   ; GCN-NEXT:   $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23
@@ -200,11 +200,11 @@ body:             |
   ; GCN-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
   ; GCN-NEXT:   $sgpr32 = frame-destroy COPY $sgpr33
   ; GCN-NEXT:   $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; GCN-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.69, addrspace 5)
-  ; GCN-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.70, addrspace 5)
-  ; GCN-NEXT:   $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.71, addrspace 5)
-  ; GCN-NEXT:   $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.72, addrspace 5)
-  ; GCN-NEXT:   $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.73, addrspace 5)
+  ; GCN-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.68, addrspace 5)
+  ; GCN-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.69, addrspace 5)
+  ; GCN-NEXT:   $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.70, addrspace 5)
+  ; GCN-NEXT:   $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.71, addrspace 5)
+  ; GCN-NEXT:   $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.72, addrspace 5)
   ; GCN-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr0
   ; GCN-NEXT:   $sgpr33 = frame-destroy COPY $vcc_hi
   ; GCN-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 979788c9231d0..bf9fe16f59544 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -152,8 +152,8 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v255, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v255, 0
+; GCN-NEXT:    v_readlane_b32 s31, v255, 1
 ; GCN-NEXT:    buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -445,8 +445,8 @@ define void @spill_to_lowest_available_vgpr() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v254, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v254, 0
+; GCN-NEXT:    v_readlane_b32 s31, v254, 1
 ; GCN-NEXT:    buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -1636,21 +1636,14 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 s[16:17], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
+; GCN-NEXT:    s_mov_b64 exec, 3
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    v_writelane_b32 v0, s30, 0
+; GCN-NEXT:    v_writelane_b32 v0, s31, 1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    s_mov_b64 s[16:17], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    v_writelane_b32 v0, s31, 0
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[16:17]
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, child_function_ipra@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, child_function_ipra@rel32@hi+12
@@ -1660,20 +1653,12 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[22:23]
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v0, 0
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b64 exec, 1
+; GCN-NEXT:    s_mov_b64 exec, 3
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s30, v0, 0
+; GCN-NEXT:    v_readlane_b32 s31, v0, 1
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index 1c2215d39dc02..5ead5e768af5d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX942 %s
 
 
-define void @v_shuffle_v2i64_v8i64__u_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX9-LABEL: v_shuffle_v2i64_v8i64__u_u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15,7 +15,7 @@ define void @v_shuffle_v2i64_v8i64__u_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -54,7 +54,7 @@ define void @v_shuffle_v2i64_v8i64__0_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93,7 +93,7 @@ define void @v_shuffle_v2i64_v8i64__1_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -132,7 +132,7 @@ define void @v_shuffle_v2i64_v8i64__2_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -171,7 +171,7 @@ define void @v_shuffle_v2i64_v8i64__3_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -210,7 +210,7 @@ define void @v_shuffle_v2i64_v8i64__4_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -249,7 +249,7 @@ define void @v_shuffle_v2i64_v8i64__5_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -288,7 +288,7 @@ define void @v_shuffle_v2i64_v8i64__6_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -327,7 +327,7 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX9-LABEL: v_shuffle_v2i64_v8i64__8_u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -338,7 +338,7 @@ define void @v_shuffle_v2i64_v8i64__8_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -378,7 +378,7 @@ define void @v_shuffle_v2i64_v8i64__9_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -418,7 +418,7 @@ define void @v_shuffle_v2i64_v8i64__10_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -458,7 +458,7 @@ define void @v_shuffle_v2i64_v8i64__11_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -498,7 +498,7 @@ define void @v_shuffle_v2i64_v8i64__12_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -538,7 +538,7 @@ define void @v_shuffle_v2i64_v8i64__13_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -578,7 +578,7 @@ define void @v_shuffle_v2i64_v8i64__14_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -618,7 +618,7 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -667,7 +667,7 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -717,7 +717,7 @@ define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -767,7 +767,7 @@ define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -817,7 +817,7 @@ define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -867,7 +867,7 @@ define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -917,7 +917,7 @@ define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -967,7 +967,7 @@ define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1017,7 +1017,7 @@ define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1064,7 +1064,7 @@ define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1111,7 +1111,7 @@ define void @v_shuffle_v2i64_v8i64__15_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1158,7 +1158,7 @@ define void @v_shuffle_v2i64_v8i64__15_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1205,7 +1205,7 @@ define void @v_shuffle_v2i64_v8i64__15_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1252,7 +1252,7 @@ define void @v_shuffle_v2i64_v8i64__15_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1299,7 +1299,7 @@ define void @v_shuffle_v2i64_v8i64__15_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1346,7 +1346,7 @@ define void @v_shuffle_v2i64_v8i64__15_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__15_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__15_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1393,7 +1393,7 @@ define void @v_shuffle_v2i64_v8i64__15_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1432,7 +1432,7 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1477,7 +1477,7 @@ define void @v_shuffle_v2i64_v8i64__0_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1522,7 +1522,7 @@ define void @v_shuffle_v2i64_v8i64__1_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1567,7 +1567,7 @@ define void @v_shuffle_v2i64_v8i64__2_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1612,7 +1612,7 @@ define void @v_shuffle_v2i64_v8i64__3_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1657,7 +1657,7 @@ define void @v_shuffle_v2i64_v8i64__4_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1702,7 +1702,7 @@ define void @v_shuffle_v2i64_v8i64__5_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1747,7 +1747,7 @@ define void @v_shuffle_v2i64_v8i64__6_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1793,7 +1793,7 @@ define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1832,7 +1832,7 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1888,7 +1888,7 @@ define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1944,7 +1944,7 @@ define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2000,7 +2000,7 @@ define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2056,7 +2056,7 @@ define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2112,7 +2112,7 @@ define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2168,7 +2168,7 @@ define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2207,7 +2207,7 @@ define void @v_shuffle_v2i64_v8i64__u_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2246,7 +2246,7 @@ define void @v_shuffle_v2i64_v8i64__0_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2291,7 +2291,7 @@ define void @v_shuffle_v2i64_v8i64__1_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2336,7 +2336,7 @@ define void @v_shuffle_v2i64_v8i64__2_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2381,7 +2381,7 @@ define void @v_shuffle_v2i64_v8i64__3_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2426,7 +2426,7 @@ define void @v_shuffle_v2i64_v8i64__4_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2471,7 +2471,7 @@ define void @v_shuffle_v2i64_v8i64__5_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2516,7 +2516,7 @@ define void @v_shuffle_v2i64_v8i64__6_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2562,7 +2562,7 @@ define void @v_shuffle_v2i64_v8i64__7_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2601,7 +2601,7 @@ define void @v_shuffle_v2i64_v8i64__8_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2657,7 +2657,7 @@ define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2713,7 +2713,7 @@ define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2769,7 +2769,7 @@ define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2825,7 +2825,7 @@ define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2881,7 +2881,7 @@ define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2937,7 +2937,7 @@ define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2976,7 +2976,7 @@ define void @v_shuffle_v2i64_v8i64__u_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3021,7 +3021,7 @@ define void @v_shuffle_v2i64_v8i64__0_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3060,7 +3060,7 @@ define void @v_shuffle_v2i64_v8i64__1_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3105,7 +3105,7 @@ define void @v_shuffle_v2i64_v8i64__2_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3150,7 +3150,7 @@ define void @v_shuffle_v2i64_v8i64__3_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3195,7 +3195,7 @@ define void @v_shuffle_v2i64_v8i64__4_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3240,7 +3240,7 @@ define void @v_shuffle_v2i64_v8i64__5_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3285,7 +3285,7 @@ define void @v_shuffle_v2i64_v8i64__6_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3331,7 +3331,7 @@ define void @v_shuffle_v2i64_v8i64__7_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3370,7 +3370,7 @@ define void @v_shuffle_v2i64_v8i64__8_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3426,7 +3426,7 @@ define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3482,7 +3482,7 @@ define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3538,7 +3538,7 @@ define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3594,7 +3594,7 @@ define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3650,7 +3650,7 @@ define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3706,7 +3706,7 @@ define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3745,7 +3745,7 @@ define void @v_shuffle_v2i64_v8i64__u_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3790,7 +3790,7 @@ define void @v_shuffle_v2i64_v8i64__0_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3835,7 +3835,7 @@ define void @v_shuffle_v2i64_v8i64__1_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3874,7 +3874,7 @@ define void @v_shuffle_v2i64_v8i64__2_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3919,7 +3919,7 @@ define void @v_shuffle_v2i64_v8i64__3_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3964,7 +3964,7 @@ define void @v_shuffle_v2i64_v8i64__4_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4009,7 +4009,7 @@ define void @v_shuffle_v2i64_v8i64__5_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4054,7 +4054,7 @@ define void @v_shuffle_v2i64_v8i64__6_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4100,7 +4100,7 @@ define void @v_shuffle_v2i64_v8i64__7_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4139,7 +4139,7 @@ define void @v_shuffle_v2i64_v8i64__8_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4195,7 +4195,7 @@ define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4251,7 +4251,7 @@ define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4307,7 +4307,7 @@ define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4363,7 +4363,7 @@ define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4419,7 +4419,7 @@ define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4475,7 +4475,7 @@ define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4514,7 +4514,7 @@ define void @v_shuffle_v2i64_v8i64__u_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4559,7 +4559,7 @@ define void @v_shuffle_v2i64_v8i64__0_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4604,7 +4604,7 @@ define void @v_shuffle_v2i64_v8i64__1_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4649,7 +4649,7 @@ define void @v_shuffle_v2i64_v8i64__2_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4688,7 +4688,7 @@ define void @v_shuffle_v2i64_v8i64__3_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4733,7 +4733,7 @@ define void @v_shuffle_v2i64_v8i64__4_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4778,7 +4778,7 @@ define void @v_shuffle_v2i64_v8i64__5_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4823,7 +4823,7 @@ define void @v_shuffle_v2i64_v8i64__6_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4869,7 +4869,7 @@ define void @v_shuffle_v2i64_v8i64__7_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4908,7 +4908,7 @@ define void @v_shuffle_v2i64_v8i64__8_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4964,7 +4964,7 @@ define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5020,7 +5020,7 @@ define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5076,7 +5076,7 @@ define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5132,7 +5132,7 @@ define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5188,7 +5188,7 @@ define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5244,7 +5244,7 @@ define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5283,7 +5283,7 @@ define void @v_shuffle_v2i64_v8i64__u_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5328,7 +5328,7 @@ define void @v_shuffle_v2i64_v8i64__0_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5373,7 +5373,7 @@ define void @v_shuffle_v2i64_v8i64__1_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5418,7 +5418,7 @@ define void @v_shuffle_v2i64_v8i64__2_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5463,7 +5463,7 @@ define void @v_shuffle_v2i64_v8i64__3_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5502,7 +5502,7 @@ define void @v_shuffle_v2i64_v8i64__4_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5547,7 +5547,7 @@ define void @v_shuffle_v2i64_v8i64__5_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5592,7 +5592,7 @@ define void @v_shuffle_v2i64_v8i64__6_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5638,7 +5638,7 @@ define void @v_shuffle_v2i64_v8i64__7_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5677,7 +5677,7 @@ define void @v_shuffle_v2i64_v8i64__8_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5733,7 +5733,7 @@ define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5789,7 +5789,7 @@ define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5845,7 +5845,7 @@ define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5901,7 +5901,7 @@ define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5957,7 +5957,7 @@ define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6013,7 +6013,7 @@ define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6052,7 +6052,7 @@ define void @v_shuffle_v2i64_v8i64__u_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6097,7 +6097,7 @@ define void @v_shuffle_v2i64_v8i64__0_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6142,7 +6142,7 @@ define void @v_shuffle_v2i64_v8i64__1_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6187,7 +6187,7 @@ define void @v_shuffle_v2i64_v8i64__2_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6232,7 +6232,7 @@ define void @v_shuffle_v2i64_v8i64__3_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6277,7 +6277,7 @@ define void @v_shuffle_v2i64_v8i64__4_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6316,7 +6316,7 @@ define void @v_shuffle_v2i64_v8i64__5_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6361,7 +6361,7 @@ define void @v_shuffle_v2i64_v8i64__6_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6407,7 +6407,7 @@ define void @v_shuffle_v2i64_v8i64__7_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6446,7 +6446,7 @@ define void @v_shuffle_v2i64_v8i64__8_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6502,7 +6502,7 @@ define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6558,7 +6558,7 @@ define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6614,7 +6614,7 @@ define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6670,7 +6670,7 @@ define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6726,7 +6726,7 @@ define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6782,7 +6782,7 @@ define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6821,7 +6821,7 @@ define void @v_shuffle_v2i64_v8i64__u_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6866,7 +6866,7 @@ define void @v_shuffle_v2i64_v8i64__0_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6911,7 +6911,7 @@ define void @v_shuffle_v2i64_v8i64__1_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6956,7 +6956,7 @@ define void @v_shuffle_v2i64_v8i64__2_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7001,7 +7001,7 @@ define void @v_shuffle_v2i64_v8i64__3_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7046,7 +7046,7 @@ define void @v_shuffle_v2i64_v8i64__4_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7091,7 +7091,7 @@ define void @v_shuffle_v2i64_v8i64__5_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7130,7 +7130,7 @@ define void @v_shuffle_v2i64_v8i64__6_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7176,7 +7176,7 @@ define void @v_shuffle_v2i64_v8i64__7_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7215,7 +7215,7 @@ define void @v_shuffle_v2i64_v8i64__8_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7271,7 +7271,7 @@ define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7327,7 +7327,7 @@ define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7383,7 +7383,7 @@ define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7439,7 +7439,7 @@ define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7495,7 +7495,7 @@ define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7551,7 +7551,7 @@ define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX9-LABEL: v_shuffle_v2i64_v8i64__u_8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7562,7 +7562,7 @@ define void @v_shuffle_v2i64_v8i64__u_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7601,7 +7601,7 @@ define void @v_shuffle_v2i64_v8i64__0_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7640,7 +7640,7 @@ define void @v_shuffle_v2i64_v8i64__1_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7679,7 +7679,7 @@ define void @v_shuffle_v2i64_v8i64__2_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7718,7 +7718,7 @@ define void @v_shuffle_v2i64_v8i64__3_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7757,7 +7757,7 @@ define void @v_shuffle_v2i64_v8i64__4_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7796,7 +7796,7 @@ define void @v_shuffle_v2i64_v8i64__5_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7835,7 +7835,7 @@ define void @v_shuffle_v2i64_v8i64__6_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7874,7 +7874,7 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX9-LABEL: v_shuffle_v2i64_v8i64__8_8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7885,7 +7885,7 @@ define void @v_shuffle_v2i64_v8i64__8_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7931,7 +7931,7 @@ define void @v_shuffle_v2i64_v8i64__9_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7977,7 +7977,7 @@ define void @v_shuffle_v2i64_v8i64__10_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8023,7 +8023,7 @@ define void @v_shuffle_v2i64_v8i64__11_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8069,7 +8069,7 @@ define void @v_shuffle_v2i64_v8i64__12_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8115,7 +8115,7 @@ define void @v_shuffle_v2i64_v8i64__13_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_8(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_8(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8161,7 +8161,7 @@ define void @v_shuffle_v2i64_v8i64__14_8(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8201,7 +8201,7 @@ define void @v_shuffle_v2i64_v8i64__u_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8257,7 +8257,7 @@ define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8313,7 +8313,7 @@ define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8369,7 +8369,7 @@ define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8425,7 +8425,7 @@ define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8481,7 +8481,7 @@ define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8537,7 +8537,7 @@ define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8593,7 +8593,7 @@ define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8648,7 +8648,7 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8688,7 +8688,7 @@ define void @v_shuffle_v2i64_v8i64__8_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8734,7 +8734,7 @@ define void @v_shuffle_v2i64_v8i64__9_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8780,7 +8780,7 @@ define void @v_shuffle_v2i64_v8i64__10_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8826,7 +8826,7 @@ define void @v_shuffle_v2i64_v8i64__11_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8872,7 +8872,7 @@ define void @v_shuffle_v2i64_v8i64__12_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8918,7 +8918,7 @@ define void @v_shuffle_v2i64_v8i64__13_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_9(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_9(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8964,7 +8964,7 @@ define void @v_shuffle_v2i64_v8i64__14_9(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9004,7 +9004,7 @@ define void @v_shuffle_v2i64_v8i64__u_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9060,7 +9060,7 @@ define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9116,7 +9116,7 @@ define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9172,7 +9172,7 @@ define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9228,7 +9228,7 @@ define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9284,7 +9284,7 @@ define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9340,7 +9340,7 @@ define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9396,7 +9396,7 @@ define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9451,7 +9451,7 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9497,7 +9497,7 @@ define void @v_shuffle_v2i64_v8i64__8_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9537,7 +9537,7 @@ define void @v_shuffle_v2i64_v8i64__9_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9583,7 +9583,7 @@ define void @v_shuffle_v2i64_v8i64__10_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9629,7 +9629,7 @@ define void @v_shuffle_v2i64_v8i64__11_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9675,7 +9675,7 @@ define void @v_shuffle_v2i64_v8i64__12_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9721,7 +9721,7 @@ define void @v_shuffle_v2i64_v8i64__13_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_10(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_10(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9767,7 +9767,7 @@ define void @v_shuffle_v2i64_v8i64__14_10(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9807,7 +9807,7 @@ define void @v_shuffle_v2i64_v8i64__u_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9863,7 +9863,7 @@ define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9919,7 +9919,7 @@ define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9975,7 +9975,7 @@ define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10031,7 +10031,7 @@ define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10087,7 +10087,7 @@ define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10143,7 +10143,7 @@ define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10199,7 +10199,7 @@ define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10254,7 +10254,7 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10300,7 +10300,7 @@ define void @v_shuffle_v2i64_v8i64__8_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10346,7 +10346,7 @@ define void @v_shuffle_v2i64_v8i64__9_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10386,7 +10386,7 @@ define void @v_shuffle_v2i64_v8i64__10_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10432,7 +10432,7 @@ define void @v_shuffle_v2i64_v8i64__11_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10478,7 +10478,7 @@ define void @v_shuffle_v2i64_v8i64__12_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10524,7 +10524,7 @@ define void @v_shuffle_v2i64_v8i64__13_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_11(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_11(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10570,7 +10570,7 @@ define void @v_shuffle_v2i64_v8i64__14_11(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10610,7 +10610,7 @@ define void @v_shuffle_v2i64_v8i64__u_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10666,7 +10666,7 @@ define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10722,7 +10722,7 @@ define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10778,7 +10778,7 @@ define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10834,7 +10834,7 @@ define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10890,7 +10890,7 @@ define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10946,7 +10946,7 @@ define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11002,7 +11002,7 @@ define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11057,7 +11057,7 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11103,7 +11103,7 @@ define void @v_shuffle_v2i64_v8i64__8_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11149,7 +11149,7 @@ define void @v_shuffle_v2i64_v8i64__9_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11195,7 +11195,7 @@ define void @v_shuffle_v2i64_v8i64__10_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11235,7 +11235,7 @@ define void @v_shuffle_v2i64_v8i64__11_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11281,7 +11281,7 @@ define void @v_shuffle_v2i64_v8i64__12_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11327,7 +11327,7 @@ define void @v_shuffle_v2i64_v8i64__13_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_12(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_12(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11373,7 +11373,7 @@ define void @v_shuffle_v2i64_v8i64__14_12(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11413,7 +11413,7 @@ define void @v_shuffle_v2i64_v8i64__u_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11469,7 +11469,7 @@ define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11525,7 +11525,7 @@ define void @v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11581,7 +11581,7 @@ define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11637,7 +11637,7 @@ define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11693,7 +11693,7 @@ define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11749,7 +11749,7 @@ define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11805,7 +11805,7 @@ define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11860,7 +11860,7 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11906,7 +11906,7 @@ define void @v_shuffle_v2i64_v8i64__8_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11952,7 +11952,7 @@ define void @v_shuffle_v2i64_v8i64__9_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11998,7 +11998,7 @@ define void @v_shuffle_v2i64_v8i64__10_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12044,7 +12044,7 @@ define void @v_shuffle_v2i64_v8i64__11_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12084,7 +12084,7 @@ define void @v_shuffle_v2i64_v8i64__12_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12130,7 +12130,7 @@ define void @v_shuffle_v2i64_v8i64__13_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_13(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_13(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12176,7 +12176,7 @@ define void @v_shuffle_v2i64_v8i64__14_13(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12216,7 +12216,7 @@ define void @v_shuffle_v2i64_v8i64__u_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12272,7 +12272,7 @@ define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12328,7 +12328,7 @@ define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12384,7 +12384,7 @@ define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12440,7 +12440,7 @@ define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12496,7 +12496,7 @@ define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12552,7 +12552,7 @@ define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12608,7 +12608,7 @@ define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12663,7 +12663,7 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12709,7 +12709,7 @@ define void @v_shuffle_v2i64_v8i64__8_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12755,7 +12755,7 @@ define void @v_shuffle_v2i64_v8i64__9_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12801,7 +12801,7 @@ define void @v_shuffle_v2i64_v8i64__10_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12847,7 +12847,7 @@ define void @v_shuffle_v2i64_v8i64__11_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12893,7 +12893,7 @@ define void @v_shuffle_v2i64_v8i64__12_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12933,7 +12933,7 @@ define void @v_shuffle_v2i64_v8i64__13_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_14(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_14(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12979,7 +12979,7 @@ define void @v_shuffle_v2i64_v8i64__14_14(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__u_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__u_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13019,7 +13019,7 @@ define void @v_shuffle_v2i64_v8i64__u_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13075,7 +13075,7 @@ define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13131,7 +13131,7 @@ define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13187,7 +13187,7 @@ define void @v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13243,7 +13243,7 @@ define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13299,7 +13299,7 @@ define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13355,7 +13355,7 @@ define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13411,7 +13411,7 @@ define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13466,7 +13466,7 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__8_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__8_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13512,7 +13512,7 @@ define void @v_shuffle_v2i64_v8i64__8_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__9_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__9_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13558,7 +13558,7 @@ define void @v_shuffle_v2i64_v8i64__9_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__10_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__10_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13604,7 +13604,7 @@ define void @v_shuffle_v2i64_v8i64__10_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__11_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__11_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13650,7 +13650,7 @@ define void @v_shuffle_v2i64_v8i64__11_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__12_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__12_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13696,7 +13696,7 @@ define void @v_shuffle_v2i64_v8i64__12_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__13_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__13_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13742,7 +13742,7 @@ define void @v_shuffle_v2i64_v8i64__13_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @v_shuffle_v2i64_v8i64__14_15(ptr addrspace(1) inreg %ptr) {
+define void @v_shuffle_v2i64_v8i64__14_15(ptr addrspace(1) inreg %ptr) #0 {
 ; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13782,7 +13782,7 @@ define void @v_shuffle_v2i64_v8i64__14_15(ptr addrspace(1) inreg %ptr) {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_u() {
+define void @s_shuffle_v2i64_v8i64__u_u() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__u_u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13796,7 +13796,7 @@ define void @s_shuffle_v2i64_v8i64__u_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_u() {
+define void @s_shuffle_v2i64_v8i64__0_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13836,7 +13836,7 @@ define void @s_shuffle_v2i64_v8i64__0_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_u() {
+define void @s_shuffle_v2i64_v8i64__1_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13881,7 +13881,7 @@ define void @s_shuffle_v2i64_v8i64__1_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_u() {
+define void @s_shuffle_v2i64_v8i64__2_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13921,7 +13921,7 @@ define void @s_shuffle_v2i64_v8i64__2_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_u() {
+define void @s_shuffle_v2i64_v8i64__3_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13966,7 +13966,7 @@ define void @s_shuffle_v2i64_v8i64__3_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_u() {
+define void @s_shuffle_v2i64_v8i64__4_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14010,7 +14010,7 @@ define void @s_shuffle_v2i64_v8i64__4_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_u() {
+define void @s_shuffle_v2i64_v8i64__5_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14055,7 +14055,7 @@ define void @s_shuffle_v2i64_v8i64__5_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_u() {
+define void @s_shuffle_v2i64_v8i64__6_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14100,7 +14100,7 @@ define void @s_shuffle_v2i64_v8i64__6_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_u() {
+define void @s_shuffle_v2i64_v8i64__7_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14145,7 +14145,7 @@ define void @s_shuffle_v2i64_v8i64__7_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_u() {
+define void @s_shuffle_v2i64_v8i64__8_u() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14159,7 +14159,7 @@ define void @s_shuffle_v2i64_v8i64__8_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_u() {
+define void @s_shuffle_v2i64_v8i64__9_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14205,7 +14205,7 @@ define void @s_shuffle_v2i64_v8i64__9_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_u() {
+define void @s_shuffle_v2i64_v8i64__10_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14246,7 +14246,7 @@ define void @s_shuffle_v2i64_v8i64__10_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_u() {
+define void @s_shuffle_v2i64_v8i64__11_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14292,7 +14292,7 @@ define void @s_shuffle_v2i64_v8i64__11_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_u() {
+define void @s_shuffle_v2i64_v8i64__12_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14337,7 +14337,7 @@ define void @s_shuffle_v2i64_v8i64__12_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_u() {
+define void @s_shuffle_v2i64_v8i64__13_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14383,7 +14383,7 @@ define void @s_shuffle_v2i64_v8i64__13_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_u() {
+define void @s_shuffle_v2i64_v8i64__14_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14429,7 +14429,7 @@ define void @s_shuffle_v2i64_v8i64__14_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_u() {
+define void @s_shuffle_v2i64_v8i64__15_u() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_u:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14475,7 +14475,7 @@ define void @s_shuffle_v2i64_v8i64__15_u() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_0() {
+define void @s_shuffle_v2i64_v8i64__15_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14537,7 +14537,7 @@ define void @s_shuffle_v2i64_v8i64__15_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_1() {
+define void @s_shuffle_v2i64_v8i64__15_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14593,7 +14593,7 @@ define void @s_shuffle_v2i64_v8i64__15_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_2() {
+define void @s_shuffle_v2i64_v8i64__15_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14610,13 +14610,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s30
 ; GFX900-NEXT:    s_mov_b32 s9, s31
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14639,13 +14639,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s30
 ; GFX90A-NEXT:    s_mov_b32 s9, s31
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14677,7 +14677,7 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_3() {
+define void @s_shuffle_v2i64_v8i64__15_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14733,7 +14733,7 @@ define void @s_shuffle_v2i64_v8i64__15_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_4() {
+define void @s_shuffle_v2i64_v8i64__15_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14750,13 +14750,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s30
 ; GFX900-NEXT:    s_mov_b32 s9, s31
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14779,13 +14779,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s30
 ; GFX90A-NEXT:    s_mov_b32 s9, s31
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14802,19 +14802,19 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s30
 ; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -14827,7 +14827,7 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_5() {
+define void @s_shuffle_v2i64_v8i64__15_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14845,12 +14845,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s30
 ; GFX900-NEXT:    s_mov_b32 s13, s31
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14874,12 +14874,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s30
 ; GFX90A-NEXT:    s_mov_b32 s13, s31
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -14909,7 +14909,7 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_6() {
+define void @s_shuffle_v2i64_v8i64__15_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14999,22 +14999,22 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s30
 ; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -15027,7 +15027,7 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_7() {
+define void @s_shuffle_v2i64_v8i64__15_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15120,6 +15120,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -15127,12 +15128,12 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s30
 ; GFX942-NEXT:    s_mov_b32 s13, s31
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -15145,7 +15146,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_8() {
+define void @s_shuffle_v2i64_v8i64__15_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15197,7 +15198,7 @@ define void @s_shuffle_v2i64_v8i64__15_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_9() {
+define void @s_shuffle_v2i64_v8i64__15_9() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_9:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15217,7 +15218,7 @@ define void @s_shuffle_v2i64_v8i64__15_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_10() {
+define void @s_shuffle_v2i64_v8i64__15_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15269,7 +15270,7 @@ define void @s_shuffle_v2i64_v8i64__15_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_11() {
+define void @s_shuffle_v2i64_v8i64__15_11() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_11:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15289,7 +15290,7 @@ define void @s_shuffle_v2i64_v8i64__15_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_12() {
+define void @s_shuffle_v2i64_v8i64__15_12() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_12:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15311,7 +15312,7 @@ define void @s_shuffle_v2i64_v8i64__15_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_13() {
+define void @s_shuffle_v2i64_v8i64__15_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15361,7 +15362,7 @@ define void @s_shuffle_v2i64_v8i64__15_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_14() {
+define void @s_shuffle_v2i64_v8i64__15_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15413,7 +15414,7 @@ define void @s_shuffle_v2i64_v8i64__15_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__15_15() {
+define void @s_shuffle_v2i64_v8i64__15_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15465,7 +15466,7 @@ define void @s_shuffle_v2i64_v8i64__15_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_0() {
+define void @s_shuffle_v2i64_v8i64__u_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15510,7 +15511,7 @@ define void @s_shuffle_v2i64_v8i64__u_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_0() {
+define void @s_shuffle_v2i64_v8i64__0_0() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15529,7 +15530,7 @@ define void @s_shuffle_v2i64_v8i64__0_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_0() {
+define void @s_shuffle_v2i64_v8i64__1_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15580,7 +15581,7 @@ define void @s_shuffle_v2i64_v8i64__1_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_0() {
+define void @s_shuffle_v2i64_v8i64__2_0() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15599,7 +15600,7 @@ define void @s_shuffle_v2i64_v8i64__2_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_0() {
+define void @s_shuffle_v2i64_v8i64__3_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15650,7 +15651,7 @@ define void @s_shuffle_v2i64_v8i64__3_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_0() {
+define void @s_shuffle_v2i64_v8i64__4_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15699,7 +15700,7 @@ define void @s_shuffle_v2i64_v8i64__4_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_0() {
+define void @s_shuffle_v2i64_v8i64__5_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15750,7 +15751,7 @@ define void @s_shuffle_v2i64_v8i64__5_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_0() {
+define void @s_shuffle_v2i64_v8i64__6_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15801,7 +15802,7 @@ define void @s_shuffle_v2i64_v8i64__6_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_0() {
+define void @s_shuffle_v2i64_v8i64__7_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15852,7 +15853,7 @@ define void @s_shuffle_v2i64_v8i64__7_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_0() {
+define void @s_shuffle_v2i64_v8i64__8_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15897,7 +15898,7 @@ define void @s_shuffle_v2i64_v8i64__8_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_0() {
+define void @s_shuffle_v2i64_v8i64__9_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15959,7 +15960,7 @@ define void @s_shuffle_v2i64_v8i64__9_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_0() {
+define void @s_shuffle_v2i64_v8i64__10_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16061,7 +16062,7 @@ define void @s_shuffle_v2i64_v8i64__10_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_0() {
+define void @s_shuffle_v2i64_v8i64__11_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16123,7 +16124,7 @@ define void @s_shuffle_v2i64_v8i64__11_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_0() {
+define void @s_shuffle_v2i64_v8i64__12_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16167,20 +16168,21 @@ define void @s_shuffle_v2i64_v8i64__12_0() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s10, s16
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s17
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -16193,7 +16195,7 @@ define void @s_shuffle_v2i64_v8i64__12_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_0() {
+define void @s_shuffle_v2i64_v8i64__13_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16255,7 +16257,7 @@ define void @s_shuffle_v2i64_v8i64__13_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_0() {
+define void @s_shuffle_v2i64_v8i64__14_0() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16317,7 +16319,7 @@ define void @s_shuffle_v2i64_v8i64__14_0() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_1() {
+define void @s_shuffle_v2i64_v8i64__u_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16357,7 +16359,7 @@ define void @s_shuffle_v2i64_v8i64__u_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_1() {
+define void @s_shuffle_v2i64_v8i64__0_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16397,7 +16399,7 @@ define void @s_shuffle_v2i64_v8i64__0_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_1() {
+define void @s_shuffle_v2i64_v8i64__1_1() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16416,7 +16418,7 @@ define void @s_shuffle_v2i64_v8i64__1_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_1() {
+define void @s_shuffle_v2i64_v8i64__2_1() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16435,7 +16437,7 @@ define void @s_shuffle_v2i64_v8i64__2_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_1() {
+define void @s_shuffle_v2i64_v8i64__3_1() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16454,7 +16456,7 @@ define void @s_shuffle_v2i64_v8i64__3_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_1() {
+define void @s_shuffle_v2i64_v8i64__4_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16503,7 +16505,7 @@ define void @s_shuffle_v2i64_v8i64__4_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_1() {
+define void @s_shuffle_v2i64_v8i64__5_1() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16522,7 +16524,7 @@ define void @s_shuffle_v2i64_v8i64__5_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_1() {
+define void @s_shuffle_v2i64_v8i64__6_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16573,7 +16575,7 @@ define void @s_shuffle_v2i64_v8i64__6_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_1() {
+define void @s_shuffle_v2i64_v8i64__7_1() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16592,7 +16594,7 @@ define void @s_shuffle_v2i64_v8i64__7_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_1() {
+define void @s_shuffle_v2i64_v8i64__8_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16632,7 +16634,7 @@ define void @s_shuffle_v2i64_v8i64__8_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_1() {
+define void @s_shuffle_v2i64_v8i64__9_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16688,7 +16690,7 @@ define void @s_shuffle_v2i64_v8i64__9_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_1() {
+define void @s_shuffle_v2i64_v8i64__10_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16790,7 +16792,7 @@ define void @s_shuffle_v2i64_v8i64__10_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_1() {
+define void @s_shuffle_v2i64_v8i64__11_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16846,7 +16848,7 @@ define void @s_shuffle_v2i64_v8i64__11_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_1() {
+define void @s_shuffle_v2i64_v8i64__12_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16890,20 +16892,21 @@ define void @s_shuffle_v2i64_v8i64__12_1() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s10, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s19
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -16916,7 +16919,7 @@ define void @s_shuffle_v2i64_v8i64__12_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_1() {
+define void @s_shuffle_v2i64_v8i64__13_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16972,7 +16975,7 @@ define void @s_shuffle_v2i64_v8i64__13_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_1() {
+define void @s_shuffle_v2i64_v8i64__14_1() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17034,7 +17037,7 @@ define void @s_shuffle_v2i64_v8i64__14_1() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_2() {
+define void @s_shuffle_v2i64_v8i64__u_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17079,7 +17082,7 @@ define void @s_shuffle_v2i64_v8i64__u_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_2() {
+define void @s_shuffle_v2i64_v8i64__0_2() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17098,7 +17101,7 @@ define void @s_shuffle_v2i64_v8i64__0_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_2() {
+define void @s_shuffle_v2i64_v8i64__1_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17149,7 +17152,7 @@ define void @s_shuffle_v2i64_v8i64__1_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_2() {
+define void @s_shuffle_v2i64_v8i64__2_2() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17168,7 +17171,7 @@ define void @s_shuffle_v2i64_v8i64__2_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_2() {
+define void @s_shuffle_v2i64_v8i64__3_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17219,7 +17222,7 @@ define void @s_shuffle_v2i64_v8i64__3_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_2() {
+define void @s_shuffle_v2i64_v8i64__4_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17268,7 +17271,7 @@ define void @s_shuffle_v2i64_v8i64__4_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_2() {
+define void @s_shuffle_v2i64_v8i64__5_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17319,7 +17322,7 @@ define void @s_shuffle_v2i64_v8i64__5_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_2() {
+define void @s_shuffle_v2i64_v8i64__6_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17370,7 +17373,7 @@ define void @s_shuffle_v2i64_v8i64__6_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_2() {
+define void @s_shuffle_v2i64_v8i64__7_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17421,7 +17424,7 @@ define void @s_shuffle_v2i64_v8i64__7_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_2() {
+define void @s_shuffle_v2i64_v8i64__8_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17466,7 +17469,7 @@ define void @s_shuffle_v2i64_v8i64__8_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_2() {
+define void @s_shuffle_v2i64_v8i64__9_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17481,6 +17484,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17489,7 +17493,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17510,6 +17513,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17518,7 +17522,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17550,7 +17553,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_2() {
+define void @s_shuffle_v2i64_v8i64__10_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17565,13 +17568,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s20
 ; GFX900-NEXT:    s_mov_b32 s11, s21
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17592,13 +17595,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s20
 ; GFX90A-NEXT:    s_mov_b32 s11, s21
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17612,6 +17615,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -17620,13 +17624,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s10, s20
 ; GFX942-NEXT:    s_mov_b32 s11, s21
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -17639,7 +17643,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_2() {
+define void @s_shuffle_v2i64_v8i64__11_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17654,6 +17658,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17662,7 +17667,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17683,6 +17687,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17691,7 +17696,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17723,7 +17727,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_2() {
+define void @s_shuffle_v2i64_v8i64__12_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17783,7 +17787,7 @@ define void @s_shuffle_v2i64_v8i64__12_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_2() {
+define void @s_shuffle_v2i64_v8i64__13_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17798,6 +17802,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -17806,7 +17811,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17827,6 +17831,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -17835,7 +17840,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -17867,7 +17871,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_2() {
+define void @s_shuffle_v2i64_v8i64__14_2() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17929,7 +17933,7 @@ define void @s_shuffle_v2i64_v8i64__14_2() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_3() {
+define void @s_shuffle_v2i64_v8i64__u_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17969,7 +17973,7 @@ define void @s_shuffle_v2i64_v8i64__u_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_3() {
+define void @s_shuffle_v2i64_v8i64__0_3() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17988,7 +17992,7 @@ define void @s_shuffle_v2i64_v8i64__0_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_3() {
+define void @s_shuffle_v2i64_v8i64__1_3() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18007,7 +18011,7 @@ define void @s_shuffle_v2i64_v8i64__1_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_3() {
+define void @s_shuffle_v2i64_v8i64__2_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18047,7 +18051,7 @@ define void @s_shuffle_v2i64_v8i64__2_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_3() {
+define void @s_shuffle_v2i64_v8i64__3_3() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18066,7 +18070,7 @@ define void @s_shuffle_v2i64_v8i64__3_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_3() {
+define void @s_shuffle_v2i64_v8i64__4_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18115,7 +18119,7 @@ define void @s_shuffle_v2i64_v8i64__4_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_3() {
+define void @s_shuffle_v2i64_v8i64__5_3() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18134,7 +18138,7 @@ define void @s_shuffle_v2i64_v8i64__5_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_3() {
+define void @s_shuffle_v2i64_v8i64__6_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18185,7 +18189,7 @@ define void @s_shuffle_v2i64_v8i64__6_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_3() {
+define void @s_shuffle_v2i64_v8i64__7_3() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18204,7 +18208,7 @@ define void @s_shuffle_v2i64_v8i64__7_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_3() {
+define void @s_shuffle_v2i64_v8i64__8_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18244,7 +18248,7 @@ define void @s_shuffle_v2i64_v8i64__8_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_3() {
+define void @s_shuffle_v2i64_v8i64__9_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18300,7 +18304,7 @@ define void @s_shuffle_v2i64_v8i64__9_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_3() {
+define void @s_shuffle_v2i64_v8i64__10_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18315,13 +18319,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s10, s22
 ; GFX900-NEXT:    s_mov_b32 s11, s23
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18342,13 +18346,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s10, s22
 ; GFX90A-NEXT:    s_mov_b32 s11, s23
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18362,6 +18366,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -18370,13 +18375,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s10, s22
 ; GFX942-NEXT:    s_mov_b32 s11, s23
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -18389,7 +18394,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_3() {
+define void @s_shuffle_v2i64_v8i64__11_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18445,7 +18450,7 @@ define void @s_shuffle_v2i64_v8i64__11_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_3() {
+define void @s_shuffle_v2i64_v8i64__12_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18505,7 +18510,7 @@ define void @s_shuffle_v2i64_v8i64__12_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_3() {
+define void @s_shuffle_v2i64_v8i64__13_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18561,7 +18566,7 @@ define void @s_shuffle_v2i64_v8i64__13_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_3() {
+define void @s_shuffle_v2i64_v8i64__14_3() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18623,7 +18628,7 @@ define void @s_shuffle_v2i64_v8i64__14_3() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_4() {
+define void @s_shuffle_v2i64_v8i64__u_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18668,7 +18673,7 @@ define void @s_shuffle_v2i64_v8i64__u_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_4() {
+define void @s_shuffle_v2i64_v8i64__0_4() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18687,7 +18692,7 @@ define void @s_shuffle_v2i64_v8i64__0_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_4() {
+define void @s_shuffle_v2i64_v8i64__1_4() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18708,7 +18713,7 @@ define void @s_shuffle_v2i64_v8i64__1_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_4() {
+define void @s_shuffle_v2i64_v8i64__2_4() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18727,7 +18732,7 @@ define void @s_shuffle_v2i64_v8i64__2_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_4() {
+define void @s_shuffle_v2i64_v8i64__3_4() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18748,7 +18753,7 @@ define void @s_shuffle_v2i64_v8i64__3_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_4() {
+define void @s_shuffle_v2i64_v8i64__4_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18797,7 +18802,7 @@ define void @s_shuffle_v2i64_v8i64__4_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_4() {
+define void @s_shuffle_v2i64_v8i64__5_4() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18818,7 +18823,7 @@ define void @s_shuffle_v2i64_v8i64__5_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_4() {
+define void @s_shuffle_v2i64_v8i64__6_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18869,7 +18874,7 @@ define void @s_shuffle_v2i64_v8i64__6_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_4() {
+define void @s_shuffle_v2i64_v8i64__7_4() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18890,7 +18895,7 @@ define void @s_shuffle_v2i64_v8i64__7_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_4() {
+define void @s_shuffle_v2i64_v8i64__8_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18935,7 +18940,7 @@ define void @s_shuffle_v2i64_v8i64__8_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_4() {
+define void @s_shuffle_v2i64_v8i64__9_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18950,6 +18955,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -18958,7 +18964,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -18979,6 +18984,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -18987,7 +18993,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19004,19 +19009,19 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19029,7 +19034,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_4() {
+define void @s_shuffle_v2i64_v8i64__10_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19085,7 +19090,7 @@ define void @s_shuffle_v2i64_v8i64__10_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_4() {
+define void @s_shuffle_v2i64_v8i64__11_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19100,6 +19105,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -19108,7 +19114,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19129,6 +19134,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -19137,7 +19143,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19154,19 +19159,19 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s22
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19179,7 +19184,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_4() {
+define void @s_shuffle_v2i64_v8i64__12_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19197,12 +19202,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
 ; GFX900-NEXT:    s_mov_b32 s26, s12
 ; GFX900-NEXT:    s_mov_b32 s27, s13
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19226,12 +19231,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
 ; GFX90A-NEXT:    s_mov_b32 s26, s12
 ; GFX90A-NEXT:    s_mov_b32 s27, s13
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19261,7 +19266,7 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_4() {
+define void @s_shuffle_v2i64_v8i64__13_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19276,6 +19281,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -19284,7 +19290,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19305,6 +19310,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -19313,7 +19319,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19330,19 +19335,19 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s26
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -19355,7 +19360,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_4() {
+define void @s_shuffle_v2i64_v8i64__14_4() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19374,11 +19379,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
 ; GFX900-NEXT:    s_mov_b32 s31, s13
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19403,11 +19408,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
 ; GFX90A-NEXT:    s_mov_b32 s31, s13
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19439,7 +19444,7 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_5() {
+define void @s_shuffle_v2i64_v8i64__u_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19483,7 +19488,7 @@ define void @s_shuffle_v2i64_v8i64__u_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_5() {
+define void @s_shuffle_v2i64_v8i64__0_5() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19502,7 +19507,7 @@ define void @s_shuffle_v2i64_v8i64__0_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_5() {
+define void @s_shuffle_v2i64_v8i64__1_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19551,7 +19556,7 @@ define void @s_shuffle_v2i64_v8i64__1_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_5() {
+define void @s_shuffle_v2i64_v8i64__2_5() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19570,7 +19575,7 @@ define void @s_shuffle_v2i64_v8i64__2_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_5() {
+define void @s_shuffle_v2i64_v8i64__3_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19619,7 +19624,7 @@ define void @s_shuffle_v2i64_v8i64__3_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_5() {
+define void @s_shuffle_v2i64_v8i64__4_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19663,7 +19668,7 @@ define void @s_shuffle_v2i64_v8i64__4_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_5() {
+define void @s_shuffle_v2i64_v8i64__5_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19712,7 +19717,7 @@ define void @s_shuffle_v2i64_v8i64__5_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_5() {
+define void @s_shuffle_v2i64_v8i64__6_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19763,7 +19768,7 @@ define void @s_shuffle_v2i64_v8i64__6_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_5() {
+define void @s_shuffle_v2i64_v8i64__7_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19812,7 +19817,7 @@ define void @s_shuffle_v2i64_v8i64__7_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_5() {
+define void @s_shuffle_v2i64_v8i64__8_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19856,7 +19861,7 @@ define void @s_shuffle_v2i64_v8i64__8_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_5() {
+define void @s_shuffle_v2i64_v8i64__9_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19874,12 +19879,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s18
 ; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19903,12 +19908,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s18
 ; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -19938,7 +19943,7 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_5() {
+define void @s_shuffle_v2i64_v8i64__10_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19994,7 +19999,7 @@ define void @s_shuffle_v2i64_v8i64__10_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_5() {
+define void @s_shuffle_v2i64_v8i64__11_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20012,12 +20017,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s22
 ; GFX900-NEXT:    s_mov_b32 s13, s23
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20041,12 +20046,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s22
 ; GFX90A-NEXT:    s_mov_b32 s13, s23
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20076,7 +20081,7 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_5() {
+define void @s_shuffle_v2i64_v8i64__12_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20094,12 +20099,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
 ; GFX900-NEXT:    s_mov_b32 s26, s14
 ; GFX900-NEXT:    s_mov_b32 s27, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20123,12 +20128,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
 ; GFX90A-NEXT:    s_mov_b32 s26, s14
 ; GFX90A-NEXT:    s_mov_b32 s27, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20158,7 +20163,7 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_5() {
+define void @s_shuffle_v2i64_v8i64__13_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20176,12 +20181,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
 ; GFX900-NEXT:    s_mov_b32 s12, s26
 ; GFX900-NEXT:    s_mov_b32 s13, s27
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20205,12 +20210,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
 ; GFX90A-NEXT:    s_mov_b32 s12, s26
 ; GFX90A-NEXT:    s_mov_b32 s13, s27
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20240,7 +20245,7 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_5() {
+define void @s_shuffle_v2i64_v8i64__14_5() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20259,11 +20264,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
 ; GFX900-NEXT:    s_mov_b32 s31, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20288,11 +20293,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
 ; GFX90A-NEXT:    s_mov_b32 s31, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20324,7 +20329,7 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_6() {
+define void @s_shuffle_v2i64_v8i64__u_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20369,7 +20374,7 @@ define void @s_shuffle_v2i64_v8i64__u_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_6() {
+define void @s_shuffle_v2i64_v8i64__0_6() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_6:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20388,7 +20393,7 @@ define void @s_shuffle_v2i64_v8i64__0_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_6() {
+define void @s_shuffle_v2i64_v8i64__1_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20439,7 +20444,7 @@ define void @s_shuffle_v2i64_v8i64__1_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_6() {
+define void @s_shuffle_v2i64_v8i64__2_6() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_6:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20458,7 +20463,7 @@ define void @s_shuffle_v2i64_v8i64__2_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_6() {
+define void @s_shuffle_v2i64_v8i64__3_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20509,7 +20514,7 @@ define void @s_shuffle_v2i64_v8i64__3_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_6() {
+define void @s_shuffle_v2i64_v8i64__4_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20558,7 +20563,7 @@ define void @s_shuffle_v2i64_v8i64__4_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_6() {
+define void @s_shuffle_v2i64_v8i64__5_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20609,7 +20614,7 @@ define void @s_shuffle_v2i64_v8i64__5_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_6() {
+define void @s_shuffle_v2i64_v8i64__6_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20660,7 +20665,7 @@ define void @s_shuffle_v2i64_v8i64__6_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_6() {
+define void @s_shuffle_v2i64_v8i64__7_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20711,7 +20716,7 @@ define void @s_shuffle_v2i64_v8i64__7_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_6() {
+define void @s_shuffle_v2i64_v8i64__8_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20756,7 +20761,7 @@ define void @s_shuffle_v2i64_v8i64__8_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_6() {
+define void @s_shuffle_v2i64_v8i64__9_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20846,22 +20851,22 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -20874,7 +20879,7 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_6() {
+define void @s_shuffle_v2i64_v8i64__10_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20930,7 +20935,7 @@ define void @s_shuffle_v2i64_v8i64__10_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_6() {
+define void @s_shuffle_v2i64_v8i64__11_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21020,22 +21025,22 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s22
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21048,7 +21053,7 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_6() {
+define void @s_shuffle_v2i64_v8i64__12_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21154,7 +21159,7 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_6() {
+define void @s_shuffle_v2i64_v8i64__13_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21244,22 +21249,22 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s10, s12
+; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s26
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21272,7 +21277,7 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_6() {
+define void @s_shuffle_v2i64_v8i64__14_6() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21362,10 +21367,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21373,11 +21379,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX942-NEXT:    s_mov_b32 s31, s13
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21390,7 +21396,7 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_7() {
+define void @s_shuffle_v2i64_v8i64__u_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21435,7 +21441,7 @@ define void @s_shuffle_v2i64_v8i64__u_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_7() {
+define void @s_shuffle_v2i64_v8i64__0_7() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21454,7 +21460,7 @@ define void @s_shuffle_v2i64_v8i64__0_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_7() {
+define void @s_shuffle_v2i64_v8i64__1_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21505,7 +21511,7 @@ define void @s_shuffle_v2i64_v8i64__1_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_7() {
+define void @s_shuffle_v2i64_v8i64__2_7() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21524,7 +21530,7 @@ define void @s_shuffle_v2i64_v8i64__2_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_7() {
+define void @s_shuffle_v2i64_v8i64__3_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21575,7 +21581,7 @@ define void @s_shuffle_v2i64_v8i64__3_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_7() {
+define void @s_shuffle_v2i64_v8i64__4_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21624,7 +21630,7 @@ define void @s_shuffle_v2i64_v8i64__4_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_7() {
+define void @s_shuffle_v2i64_v8i64__5_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21675,7 +21681,7 @@ define void @s_shuffle_v2i64_v8i64__5_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_7() {
+define void @s_shuffle_v2i64_v8i64__6_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21720,7 +21726,7 @@ define void @s_shuffle_v2i64_v8i64__6_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_7() {
+define void @s_shuffle_v2i64_v8i64__7_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21771,7 +21777,7 @@ define void @s_shuffle_v2i64_v8i64__7_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_7() {
+define void @s_shuffle_v2i64_v8i64__8_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21816,7 +21822,7 @@ define void @s_shuffle_v2i64_v8i64__8_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_7() {
+define void @s_shuffle_v2i64_v8i64__9_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21909,6 +21915,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -21916,12 +21923,12 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s18
 ; GFX942-NEXT:    s_mov_b32 s13, s19
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -21934,7 +21941,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_7() {
+define void @s_shuffle_v2i64_v8i64__10_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21990,7 +21997,7 @@ define void @s_shuffle_v2i64_v8i64__10_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_7() {
+define void @s_shuffle_v2i64_v8i64__11_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22083,6 +22090,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -22090,12 +22098,12 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s22
 ; GFX942-NEXT:    s_mov_b32 s13, s23
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22108,7 +22116,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_7() {
+define void @s_shuffle_v2i64_v8i64__12_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22214,7 +22222,7 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_7() {
+define void @s_shuffle_v2i64_v8i64__13_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22307,6 +22315,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -22314,12 +22323,12 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX942-NEXT:    s_mov_b32 s12, s26
 ; GFX942-NEXT:    s_mov_b32 s13, s27
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22332,7 +22341,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_7() {
+define void @s_shuffle_v2i64_v8i64__14_7() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_7:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22422,10 +22431,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22433,11 +22443,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX942-NEXT:    s_mov_b32 s31, s15
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -22450,7 +22460,7 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_8() {
+define void @s_shuffle_v2i64_v8i64__u_8() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__u_8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22464,7 +22474,7 @@ define void @s_shuffle_v2i64_v8i64__u_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_8() {
+define void @s_shuffle_v2i64_v8i64__0_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22504,7 +22514,7 @@ define void @s_shuffle_v2i64_v8i64__0_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_8() {
+define void @s_shuffle_v2i64_v8i64__1_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22549,7 +22559,7 @@ define void @s_shuffle_v2i64_v8i64__1_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_8() {
+define void @s_shuffle_v2i64_v8i64__2_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22589,7 +22599,7 @@ define void @s_shuffle_v2i64_v8i64__2_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_8() {
+define void @s_shuffle_v2i64_v8i64__3_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22634,7 +22644,7 @@ define void @s_shuffle_v2i64_v8i64__3_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_8() {
+define void @s_shuffle_v2i64_v8i64__4_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22678,7 +22688,7 @@ define void @s_shuffle_v2i64_v8i64__4_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_8() {
+define void @s_shuffle_v2i64_v8i64__5_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22723,7 +22733,7 @@ define void @s_shuffle_v2i64_v8i64__5_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_8() {
+define void @s_shuffle_v2i64_v8i64__6_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22768,7 +22778,7 @@ define void @s_shuffle_v2i64_v8i64__6_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_8() {
+define void @s_shuffle_v2i64_v8i64__7_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22813,7 +22823,7 @@ define void @s_shuffle_v2i64_v8i64__7_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_8() {
+define void @s_shuffle_v2i64_v8i64__8_8() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22827,7 +22837,7 @@ define void @s_shuffle_v2i64_v8i64__8_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_8() {
+define void @s_shuffle_v2i64_v8i64__9_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22879,7 +22889,7 @@ define void @s_shuffle_v2i64_v8i64__9_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_8() {
+define void @s_shuffle_v2i64_v8i64__10_8() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22899,7 +22909,7 @@ define void @s_shuffle_v2i64_v8i64__10_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_8() {
+define void @s_shuffle_v2i64_v8i64__11_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22951,7 +22961,7 @@ define void @s_shuffle_v2i64_v8i64__11_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_8() {
+define void @s_shuffle_v2i64_v8i64__12_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23001,7 +23011,7 @@ define void @s_shuffle_v2i64_v8i64__12_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_8() {
+define void @s_shuffle_v2i64_v8i64__13_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23053,7 +23063,7 @@ define void @s_shuffle_v2i64_v8i64__13_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_8() {
+define void @s_shuffle_v2i64_v8i64__14_8() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_8:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23105,7 +23115,7 @@ define void @s_shuffle_v2i64_v8i64__14_8() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_9() {
+define void @s_shuffle_v2i64_v8i64__u_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23146,7 +23156,7 @@ define void @s_shuffle_v2i64_v8i64__u_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_9() {
+define void @s_shuffle_v2i64_v8i64__0_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23202,7 +23212,7 @@ define void @s_shuffle_v2i64_v8i64__0_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_9() {
+define void @s_shuffle_v2i64_v8i64__1_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23258,7 +23268,7 @@ define void @s_shuffle_v2i64_v8i64__1_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_9() {
+define void @s_shuffle_v2i64_v8i64__2_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23314,7 +23324,7 @@ define void @s_shuffle_v2i64_v8i64__2_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_9() {
+define void @s_shuffle_v2i64_v8i64__3_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23416,7 +23426,7 @@ define void @s_shuffle_v2i64_v8i64__3_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_9() {
+define void @s_shuffle_v2i64_v8i64__4_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23434,12 +23444,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
 ; GFX900-NEXT:    s_mov_b32 s14, s18
 ; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23463,12 +23473,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s18
 ; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23498,7 +23508,7 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_9() {
+define void @s_shuffle_v2i64_v8i64__5_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23513,13 +23523,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:23]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s26
 ; GFX900-NEXT:    s_mov_b32 s9, s27
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23540,13 +23550,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:23]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s26
 ; GFX90A-NEXT:    s_mov_b32 s9, s27
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -23560,6 +23570,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -23568,13 +23579,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s8, s26
 ; GFX942-NEXT:    s_mov_b32 s9, s27
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -23587,7 +23598,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_9() {
+define void @s_shuffle_v2i64_v8i64__6_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23680,6 +23691,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -23687,12 +23699,12 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX942-NEXT:    s_mov_b32 s14, s18
 ; GFX942-NEXT:    s_mov_b32 s15, s19
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -23705,7 +23717,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_9() {
+define void @s_shuffle_v2i64_v8i64__7_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23761,7 +23773,7 @@ define void @s_shuffle_v2i64_v8i64__7_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_9() {
+define void @s_shuffle_v2i64_v8i64__8_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23802,7 +23814,7 @@ define void @s_shuffle_v2i64_v8i64__8_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_9() {
+define void @s_shuffle_v2i64_v8i64__9_9() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_9:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23822,7 +23834,7 @@ define void @s_shuffle_v2i64_v8i64__9_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_9() {
+define void @s_shuffle_v2i64_v8i64__10_9() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_9:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23842,7 +23854,7 @@ define void @s_shuffle_v2i64_v8i64__10_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_9() {
+define void @s_shuffle_v2i64_v8i64__11_9() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_9:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23862,7 +23874,7 @@ define void @s_shuffle_v2i64_v8i64__11_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_9() {
+define void @s_shuffle_v2i64_v8i64__12_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23912,7 +23924,7 @@ define void @s_shuffle_v2i64_v8i64__12_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_9() {
+define void @s_shuffle_v2i64_v8i64__13_9() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_9:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23932,7 +23944,7 @@ define void @s_shuffle_v2i64_v8i64__13_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_9() {
+define void @s_shuffle_v2i64_v8i64__14_9() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_9:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23984,7 +23996,7 @@ define void @s_shuffle_v2i64_v8i64__14_9() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_10() {
+define void @s_shuffle_v2i64_v8i64__u_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24030,7 +24042,7 @@ define void @s_shuffle_v2i64_v8i64__u_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_10() {
+define void @s_shuffle_v2i64_v8i64__0_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24086,7 +24098,7 @@ define void @s_shuffle_v2i64_v8i64__0_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_10() {
+define void @s_shuffle_v2i64_v8i64__1_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24148,7 +24160,7 @@ define void @s_shuffle_v2i64_v8i64__1_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_10() {
+define void @s_shuffle_v2i64_v8i64__2_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24204,7 +24216,7 @@ define void @s_shuffle_v2i64_v8i64__2_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_10() {
+define void @s_shuffle_v2i64_v8i64__3_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24266,7 +24278,7 @@ define void @s_shuffle_v2i64_v8i64__3_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_10() {
+define void @s_shuffle_v2i64_v8i64__4_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24284,12 +24296,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
 ; GFX900-NEXT:    s_mov_b32 s14, s20
 ; GFX900-NEXT:    s_mov_b32 s15, s21
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24313,12 +24325,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s20
 ; GFX90A-NEXT:    s_mov_b32 s15, s21
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24348,7 +24360,7 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_10() {
+define void @s_shuffle_v2i64_v8i64__5_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24363,6 +24375,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s20
@@ -24371,7 +24384,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24392,6 +24404,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s20
@@ -24400,7 +24413,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -24431,7 +24443,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_10() {
+define void @s_shuffle_v2i64_v8i64__6_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24524,6 +24536,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -24531,12 +24544,12 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX942-NEXT:    s_mov_b32 s14, s20
 ; GFX942-NEXT:    s_mov_b32 s15, s21
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -24549,7 +24562,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_10() {
+define void @s_shuffle_v2i64_v8i64__7_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24639,22 +24652,22 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s20
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s21
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -24667,7 +24680,7 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_10() {
+define void @s_shuffle_v2i64_v8i64__8_10() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_10:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24687,7 +24700,7 @@ define void @s_shuffle_v2i64_v8i64__8_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_10() {
+define void @s_shuffle_v2i64_v8i64__9_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24739,7 +24752,7 @@ define void @s_shuffle_v2i64_v8i64__9_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_10() {
+define void @s_shuffle_v2i64_v8i64__10_10() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_10:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24759,7 +24772,7 @@ define void @s_shuffle_v2i64_v8i64__10_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_10() {
+define void @s_shuffle_v2i64_v8i64__11_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24811,7 +24824,7 @@ define void @s_shuffle_v2i64_v8i64__11_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_10() {
+define void @s_shuffle_v2i64_v8i64__12_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24861,7 +24874,7 @@ define void @s_shuffle_v2i64_v8i64__12_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_10() {
+define void @s_shuffle_v2i64_v8i64__13_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24913,7 +24926,7 @@ define void @s_shuffle_v2i64_v8i64__13_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_10() {
+define void @s_shuffle_v2i64_v8i64__14_10() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_10:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24965,7 +24978,7 @@ define void @s_shuffle_v2i64_v8i64__14_10() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_11() {
+define void @s_shuffle_v2i64_v8i64__u_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25006,7 +25019,7 @@ define void @s_shuffle_v2i64_v8i64__u_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_11() {
+define void @s_shuffle_v2i64_v8i64__0_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25062,7 +25075,7 @@ define void @s_shuffle_v2i64_v8i64__0_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_11() {
+define void @s_shuffle_v2i64_v8i64__1_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25164,7 +25177,7 @@ define void @s_shuffle_v2i64_v8i64__1_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_11() {
+define void @s_shuffle_v2i64_v8i64__2_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25220,7 +25233,7 @@ define void @s_shuffle_v2i64_v8i64__2_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_11() {
+define void @s_shuffle_v2i64_v8i64__3_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25235,13 +25248,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25262,13 +25275,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25282,6 +25295,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -25290,13 +25304,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s8, s22
 ; GFX942-NEXT:    s_mov_b32 s9, s23
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -25309,7 +25323,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_11() {
+define void @s_shuffle_v2i64_v8i64__4_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25327,12 +25341,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
 ; GFX900-NEXT:    s_mov_b32 s14, s22
 ; GFX900-NEXT:    s_mov_b32 s15, s23
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25356,12 +25370,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s22
 ; GFX90A-NEXT:    s_mov_b32 s15, s23
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -25391,7 +25405,7 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_11() {
+define void @s_shuffle_v2i64_v8i64__5_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25447,7 +25461,7 @@ define void @s_shuffle_v2i64_v8i64__5_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_11() {
+define void @s_shuffle_v2i64_v8i64__6_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25540,6 +25554,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -25547,12 +25562,12 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX942-NEXT:    s_mov_b32 s14, s22
 ; GFX942-NEXT:    s_mov_b32 s15, s23
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -25565,7 +25580,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_11() {
+define void @s_shuffle_v2i64_v8i64__7_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25621,7 +25636,7 @@ define void @s_shuffle_v2i64_v8i64__7_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_11() {
+define void @s_shuffle_v2i64_v8i64__8_11() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_11:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25641,7 +25656,7 @@ define void @s_shuffle_v2i64_v8i64__8_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_11() {
+define void @s_shuffle_v2i64_v8i64__9_11() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_11:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25661,7 +25676,7 @@ define void @s_shuffle_v2i64_v8i64__9_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_11() {
+define void @s_shuffle_v2i64_v8i64__10_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25702,7 +25717,7 @@ define void @s_shuffle_v2i64_v8i64__10_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_11() {
+define void @s_shuffle_v2i64_v8i64__11_11() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_11:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25722,7 +25737,7 @@ define void @s_shuffle_v2i64_v8i64__11_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_11() {
+define void @s_shuffle_v2i64_v8i64__12_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25772,7 +25787,7 @@ define void @s_shuffle_v2i64_v8i64__12_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_11() {
+define void @s_shuffle_v2i64_v8i64__13_11() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_11:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25792,7 +25807,7 @@ define void @s_shuffle_v2i64_v8i64__13_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_11() {
+define void @s_shuffle_v2i64_v8i64__14_11() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_11:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25844,7 +25859,7 @@ define void @s_shuffle_v2i64_v8i64__14_11() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_12() {
+define void @s_shuffle_v2i64_v8i64__u_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25890,7 +25905,7 @@ define void @s_shuffle_v2i64_v8i64__u_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_12() {
+define void @s_shuffle_v2i64_v8i64__0_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25946,7 +25961,7 @@ define void @s_shuffle_v2i64_v8i64__0_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_12() {
+define void @s_shuffle_v2i64_v8i64__1_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26008,7 +26023,7 @@ define void @s_shuffle_v2i64_v8i64__1_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_12() {
+define void @s_shuffle_v2i64_v8i64__2_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26064,7 +26079,7 @@ define void @s_shuffle_v2i64_v8i64__2_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_12() {
+define void @s_shuffle_v2i64_v8i64__3_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26126,7 +26141,7 @@ define void @s_shuffle_v2i64_v8i64__3_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_12() {
+define void @s_shuffle_v2i64_v8i64__4_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26144,12 +26159,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
 ; GFX900-NEXT:    s_mov_b32 s14, s24
 ; GFX900-NEXT:    s_mov_b32 s15, s25
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26173,12 +26188,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s24
 ; GFX90A-NEXT:    s_mov_b32 s15, s25
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26208,7 +26223,7 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_12() {
+define void @s_shuffle_v2i64_v8i64__5_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26223,6 +26238,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s24
@@ -26231,7 +26247,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26252,6 +26267,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s24
@@ -26260,7 +26276,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -26291,7 +26306,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_12() {
+define void @s_shuffle_v2i64_v8i64__6_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26384,6 +26399,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -26391,12 +26407,12 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX942-NEXT:    s_mov_b32 s14, s24
 ; GFX942-NEXT:    s_mov_b32 s15, s25
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26409,7 +26425,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_12() {
+define void @s_shuffle_v2i64_v8i64__7_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26499,22 +26515,22 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s24
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s25
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26527,7 +26543,7 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_12() {
+define void @s_shuffle_v2i64_v8i64__8_12() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_12:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26547,7 +26563,7 @@ define void @s_shuffle_v2i64_v8i64__8_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_12() {
+define void @s_shuffle_v2i64_v8i64__9_12() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_12:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26569,7 +26585,7 @@ define void @s_shuffle_v2i64_v8i64__9_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_12() {
+define void @s_shuffle_v2i64_v8i64__10_12() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_12:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26589,7 +26605,7 @@ define void @s_shuffle_v2i64_v8i64__10_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_12() {
+define void @s_shuffle_v2i64_v8i64__11_12() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_12:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26611,7 +26627,7 @@ define void @s_shuffle_v2i64_v8i64__11_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_12() {
+define void @s_shuffle_v2i64_v8i64__12_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26661,7 +26677,7 @@ define void @s_shuffle_v2i64_v8i64__12_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_12() {
+define void @s_shuffle_v2i64_v8i64__13_12() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_12:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26683,7 +26699,7 @@ define void @s_shuffle_v2i64_v8i64__13_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_12() {
+define void @s_shuffle_v2i64_v8i64__14_12() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_12:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26735,7 +26751,7 @@ define void @s_shuffle_v2i64_v8i64__14_12() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_13() {
+define void @s_shuffle_v2i64_v8i64__u_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26780,7 +26796,7 @@ define void @s_shuffle_v2i64_v8i64__u_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_13() {
+define void @s_shuffle_v2i64_v8i64__0_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26836,7 +26852,7 @@ define void @s_shuffle_v2i64_v8i64__0_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_13() {
+define void @s_shuffle_v2i64_v8i64__1_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26880,20 +26896,21 @@ define void @s_shuffle_v2i64_v8i64__1_13() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s8, s18
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s9, s19
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -26906,7 +26923,7 @@ define void @s_shuffle_v2i64_v8i64__1_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_13() {
+define void @s_shuffle_v2i64_v8i64__2_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26962,7 +26979,7 @@ define void @s_shuffle_v2i64_v8i64__2_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_13() {
+define void @s_shuffle_v2i64_v8i64__3_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27022,7 +27039,7 @@ define void @s_shuffle_v2i64_v8i64__3_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_13() {
+define void @s_shuffle_v2i64_v8i64__4_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27040,12 +27057,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
 ; GFX900-NEXT:    s_mov_b32 s14, s26
 ; GFX900-NEXT:    s_mov_b32 s15, s27
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27069,12 +27086,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s26
 ; GFX90A-NEXT:    s_mov_b32 s15, s27
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27104,7 +27121,7 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_13() {
+define void @s_shuffle_v2i64_v8i64__5_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27122,12 +27139,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
 ; GFX900-NEXT:    s_mov_b32 s24, s14
 ; GFX900-NEXT:    s_mov_b32 s25, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27151,12 +27168,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
 ; GFX90A-NEXT:    s_mov_b32 s24, s14
 ; GFX90A-NEXT:    s_mov_b32 s25, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -27186,7 +27203,7 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_13() {
+define void @s_shuffle_v2i64_v8i64__6_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27279,6 +27296,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -27286,12 +27304,12 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX942-NEXT:    s_mov_b32 s14, s26
 ; GFX942-NEXT:    s_mov_b32 s15, s27
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -27304,7 +27322,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_13() {
+define void @s_shuffle_v2i64_v8i64__7_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27410,7 +27428,7 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_13() {
+define void @s_shuffle_v2i64_v8i64__8_13() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_13:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27430,7 +27448,7 @@ define void @s_shuffle_v2i64_v8i64__8_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_13() {
+define void @s_shuffle_v2i64_v8i64__9_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27480,7 +27498,7 @@ define void @s_shuffle_v2i64_v8i64__9_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_13() {
+define void @s_shuffle_v2i64_v8i64__10_13() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_13:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27500,7 +27518,7 @@ define void @s_shuffle_v2i64_v8i64__10_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_13() {
+define void @s_shuffle_v2i64_v8i64__11_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27550,7 +27568,7 @@ define void @s_shuffle_v2i64_v8i64__11_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_13() {
+define void @s_shuffle_v2i64_v8i64__12_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27595,7 +27613,7 @@ define void @s_shuffle_v2i64_v8i64__12_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_13() {
+define void @s_shuffle_v2i64_v8i64__13_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27645,7 +27663,7 @@ define void @s_shuffle_v2i64_v8i64__13_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_13() {
+define void @s_shuffle_v2i64_v8i64__14_13() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_13:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27697,7 +27715,7 @@ define void @s_shuffle_v2i64_v8i64__14_13() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_14() {
+define void @s_shuffle_v2i64_v8i64__u_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27743,7 +27761,7 @@ define void @s_shuffle_v2i64_v8i64__u_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_14() {
+define void @s_shuffle_v2i64_v8i64__0_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27799,7 +27817,7 @@ define void @s_shuffle_v2i64_v8i64__0_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_14() {
+define void @s_shuffle_v2i64_v8i64__1_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27861,7 +27879,7 @@ define void @s_shuffle_v2i64_v8i64__1_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_14() {
+define void @s_shuffle_v2i64_v8i64__2_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27917,7 +27935,7 @@ define void @s_shuffle_v2i64_v8i64__2_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_14() {
+define void @s_shuffle_v2i64_v8i64__3_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27979,7 +27997,7 @@ define void @s_shuffle_v2i64_v8i64__3_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_14() {
+define void @s_shuffle_v2i64_v8i64__4_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27997,12 +28015,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
 ; GFX900-NEXT:    s_mov_b32 s14, s28
 ; GFX900-NEXT:    s_mov_b32 s15, s29
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28026,12 +28044,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s28
 ; GFX90A-NEXT:    s_mov_b32 s15, s29
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28061,7 +28079,7 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_14() {
+define void @s_shuffle_v2i64_v8i64__5_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28076,6 +28094,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:31]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s28
@@ -28084,7 +28103,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28105,6 +28123,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s28
@@ -28113,7 +28132,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -28144,7 +28162,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_14() {
+define void @s_shuffle_v2i64_v8i64__6_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28237,6 +28255,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -28244,12 +28263,12 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX942-NEXT:    s_mov_b32 s14, s28
 ; GFX942-NEXT:    s_mov_b32 s15, s29
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -28262,7 +28281,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_14() {
+define void @s_shuffle_v2i64_v8i64__7_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28352,22 +28371,22 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b32 s8, s14
+; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
 ; GFX942-NEXT:    s_mov_b32 s10, s28
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b32 s11, s29
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -28380,7 +28399,7 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_14() {
+define void @s_shuffle_v2i64_v8i64__8_14() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_14:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28400,7 +28419,7 @@ define void @s_shuffle_v2i64_v8i64__8_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_14() {
+define void @s_shuffle_v2i64_v8i64__9_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28452,7 +28471,7 @@ define void @s_shuffle_v2i64_v8i64__9_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_14() {
+define void @s_shuffle_v2i64_v8i64__10_14() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_14:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28472,7 +28491,7 @@ define void @s_shuffle_v2i64_v8i64__10_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_14() {
+define void @s_shuffle_v2i64_v8i64__11_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28524,7 +28543,7 @@ define void @s_shuffle_v2i64_v8i64__11_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_14() {
+define void @s_shuffle_v2i64_v8i64__12_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28574,7 +28593,7 @@ define void @s_shuffle_v2i64_v8i64__12_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_14() {
+define void @s_shuffle_v2i64_v8i64__13_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28626,7 +28645,7 @@ define void @s_shuffle_v2i64_v8i64__13_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_14() {
+define void @s_shuffle_v2i64_v8i64__14_14() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_14:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28678,7 +28697,7 @@ define void @s_shuffle_v2i64_v8i64__14_14() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__u_15() {
+define void @s_shuffle_v2i64_v8i64__u_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28724,7 +28743,7 @@ define void @s_shuffle_v2i64_v8i64__u_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__0_15() {
+define void @s_shuffle_v2i64_v8i64__0_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28780,7 +28799,7 @@ define void @s_shuffle_v2i64_v8i64__0_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__1_15() {
+define void @s_shuffle_v2i64_v8i64__1_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28842,7 +28861,7 @@ define void @s_shuffle_v2i64_v8i64__1_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__2_15() {
+define void @s_shuffle_v2i64_v8i64__2_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28898,7 +28917,7 @@ define void @s_shuffle_v2i64_v8i64__2_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__3_15() {
+define void @s_shuffle_v2i64_v8i64__3_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28960,7 +28979,7 @@ define void @s_shuffle_v2i64_v8i64__3_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_15() {
+define void @s_shuffle_v2i64_v8i64__4_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28978,12 +28997,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
 ; GFX900-NEXT:    s_mov_b32 s14, s30
 ; GFX900-NEXT:    s_mov_b32 s15, s31
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29007,12 +29026,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
 ; GFX90A-NEXT:    s_mov_b32 s14, s30
 ; GFX90A-NEXT:    s_mov_b32 s15, s31
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29042,7 +29061,7 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__5_15() {
+define void @s_shuffle_v2i64_v8i64__5_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29061,11 +29080,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
 ; GFX900-NEXT:    s_mov_b32 s29, s15
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX900-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29090,11 +29109,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
 ; GFX90A-NEXT:    s_mov_b32 s29, s15
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX90A-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29126,7 +29145,7 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__6_15() {
+define void @s_shuffle_v2i64_v8i64__6_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29219,6 +29238,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
@@ -29226,12 +29246,12 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX942-NEXT:    s_mov_b32 s14, s30
 ; GFX942-NEXT:    s_mov_b32 s15, s31
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -29244,7 +29264,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__7_15() {
+define void @s_shuffle_v2i64_v8i64__7_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29334,22 +29354,23 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_mov_b32 s28, s14
 ; GFX942-NEXT:    s_mov_b32 s29, s15
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
 ; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
@@ -29362,7 +29383,7 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__8_15() {
+define void @s_shuffle_v2i64_v8i64__8_15() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29382,7 +29403,7 @@ define void @s_shuffle_v2i64_v8i64__8_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__9_15() {
+define void @s_shuffle_v2i64_v8i64__9_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29434,7 +29455,7 @@ define void @s_shuffle_v2i64_v8i64__9_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__10_15() {
+define void @s_shuffle_v2i64_v8i64__10_15() #0 {
 ; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29454,7 +29475,7 @@ define void @s_shuffle_v2i64_v8i64__10_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__11_15() {
+define void @s_shuffle_v2i64_v8i64__11_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29506,7 +29527,7 @@ define void @s_shuffle_v2i64_v8i64__11_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__12_15() {
+define void @s_shuffle_v2i64_v8i64__12_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29556,7 +29577,7 @@ define void @s_shuffle_v2i64_v8i64__12_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__13_15() {
+define void @s_shuffle_v2i64_v8i64__13_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29608,7 +29629,7 @@ define void @s_shuffle_v2i64_v8i64__13_15() {
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__14_15() {
+define void @s_shuffle_v2i64_v8i64__14_15() #0 {
 ; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_15:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29653,5 +29674,8 @@ define void @s_shuffle_v2i64_v8i64__14_15() {
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
   ret void
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX90APLUS: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
index 1ffef8e60d90d..ea67593d72761 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
@@ -24,10 +24,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-equal-sized-spills
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]], implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
@@ -89,10 +89,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-large-spill-first
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
@@ -152,10 +152,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-small-spill-first
-    ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+    ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: {{  $}}
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
-    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+    ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
     ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]]
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index ec940d9d0955f..d9d2a99c3e02d 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -242,8 +242,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -425,8 +425,8 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -469,11 +469,11 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3
 ; GCN-NEXT:    v_mov_b32_e32 v1, v40
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
 ; GCN-NEXT:    v_readlane_b32 s31, v42, 1
-; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s6, v42, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
@@ -603,23 +603,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FIJI-NEXT:    s_mov_b64 exec, s[18:19]
 ; FIJI-NEXT:    v_writelane_b32 v40, s16, 18
-; FIJI-NEXT:    v_writelane_b32 v40, s30, 0
-; FIJI-NEXT:    v_writelane_b32 v40, s31, 1
-; FIJI-NEXT:    v_writelane_b32 v40, s34, 2
-; FIJI-NEXT:    v_writelane_b32 v40, s35, 3
-; FIJI-NEXT:    v_writelane_b32 v40, s36, 4
-; FIJI-NEXT:    v_writelane_b32 v40, s37, 5
-; FIJI-NEXT:    v_writelane_b32 v40, s38, 6
-; FIJI-NEXT:    v_writelane_b32 v40, s39, 7
-; FIJI-NEXT:    v_writelane_b32 v40, s48, 8
-; FIJI-NEXT:    v_writelane_b32 v40, s49, 9
-; FIJI-NEXT:    v_writelane_b32 v40, s50, 10
-; FIJI-NEXT:    v_writelane_b32 v40, s51, 11
-; FIJI-NEXT:    v_writelane_b32 v40, s52, 12
-; FIJI-NEXT:    v_writelane_b32 v40, s53, 13
-; FIJI-NEXT:    v_writelane_b32 v40, s54, 14
-; FIJI-NEXT:    v_writelane_b32 v40, s55, 15
-; FIJI-NEXT:    v_writelane_b32 v40, s64, 16
+; FIJI-NEXT:    v_writelane_b32 v40, s34, 0
+; FIJI-NEXT:    v_writelane_b32 v40, s35, 1
+; FIJI-NEXT:    v_writelane_b32 v40, s36, 2
+; FIJI-NEXT:    v_writelane_b32 v40, s37, 3
+; FIJI-NEXT:    v_writelane_b32 v40, s38, 4
+; FIJI-NEXT:    v_writelane_b32 v40, s39, 5
+; FIJI-NEXT:    v_writelane_b32 v40, s48, 6
+; FIJI-NEXT:    v_writelane_b32 v40, s49, 7
+; FIJI-NEXT:    v_writelane_b32 v40, s50, 8
+; FIJI-NEXT:    v_writelane_b32 v40, s51, 9
+; FIJI-NEXT:    v_writelane_b32 v40, s52, 10
+; FIJI-NEXT:    v_writelane_b32 v40, s53, 11
+; FIJI-NEXT:    v_writelane_b32 v40, s54, 12
+; FIJI-NEXT:    v_writelane_b32 v40, s55, 13
+; FIJI-NEXT:    v_writelane_b32 v40, s64, 14
+; FIJI-NEXT:    v_writelane_b32 v40, s65, 15
+; FIJI-NEXT:    v_writelane_b32 v40, s30, 16
 ; FIJI-NEXT:    s_mov_b32 s50, s15
 ; FIJI-NEXT:    s_mov_b32 s51, s14
 ; FIJI-NEXT:    s_mov_b32 s52, s13
@@ -631,7 +631,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; FIJI-NEXT:    s_mov_b64 s[54:55], exec
 ; FIJI-NEXT:    s_addk_i32 s32, 0x400
-; FIJI-NEXT:    v_writelane_b32 v40, s65, 17
+; FIJI-NEXT:    v_writelane_b32 v40, s31, 17
 ; FIJI-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; FIJI-NEXT:    v_readfirstlane_b32 s16, v0
 ; FIJI-NEXT:    v_readfirstlane_b32 s17, v1
@@ -657,25 +657,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; FIJI-NEXT:    s_cbranch_execnz .LBB18_1
 ; FIJI-NEXT:  ; %bb.2:
 ; FIJI-NEXT:    s_mov_b64 exec, s[54:55]
+; FIJI-NEXT:    v_readlane_b32 s30, v40, 16
 ; FIJI-NEXT:    v_mov_b32_e32 v0, v4
-; FIJI-NEXT:    v_readlane_b32 s65, v40, 17
-; FIJI-NEXT:    v_readlane_b32 s64, v40, 16
-; FIJI-NEXT:    v_readlane_b32 s55, v40, 15
-; FIJI-NEXT:    v_readlane_b32 s54, v40, 14
-; FIJI-NEXT:    v_readlane_b32 s53, v40, 13
-; FIJI-NEXT:    v_readlane_b32 s52, v40, 12
-; FIJI-NEXT:    v_readlane_b32 s51, v40, 11
-; FIJI-NEXT:    v_readlane_b32 s50, v40, 10
-; FIJI-NEXT:    v_readlane_b32 s49, v40, 9
-; FIJI-NEXT:    v_readlane_b32 s48, v40, 8
-; FIJI-NEXT:    v_readlane_b32 s39, v40, 7
-; FIJI-NEXT:    v_readlane_b32 s38, v40, 6
-; FIJI-NEXT:    v_readlane_b32 s37, v40, 5
-; FIJI-NEXT:    v_readlane_b32 s36, v40, 4
-; FIJI-NEXT:    v_readlane_b32 s35, v40, 3
-; FIJI-NEXT:    v_readlane_b32 s34, v40, 2
-; FIJI-NEXT:    v_readlane_b32 s31, v40, 1
-; FIJI-NEXT:    v_readlane_b32 s30, v40, 0
+; FIJI-NEXT:    v_readlane_b32 s31, v40, 17
+; FIJI-NEXT:    v_readlane_b32 s65, v40, 15
+; FIJI-NEXT:    v_readlane_b32 s64, v40, 14
+; FIJI-NEXT:    v_readlane_b32 s55, v40, 13
+; FIJI-NEXT:    v_readlane_b32 s54, v40, 12
+; FIJI-NEXT:    v_readlane_b32 s53, v40, 11
+; FIJI-NEXT:    v_readlane_b32 s52, v40, 10
+; FIJI-NEXT:    v_readlane_b32 s51, v40, 9
+; FIJI-NEXT:    v_readlane_b32 s50, v40, 8
+; FIJI-NEXT:    v_readlane_b32 s49, v40, 7
+; FIJI-NEXT:    v_readlane_b32 s48, v40, 6
+; FIJI-NEXT:    v_readlane_b32 s39, v40, 5
+; FIJI-NEXT:    v_readlane_b32 s38, v40, 4
+; FIJI-NEXT:    v_readlane_b32 s37, v40, 3
+; FIJI-NEXT:    v_readlane_b32 s36, v40, 2
+; FIJI-NEXT:    v_readlane_b32 s35, v40, 1
+; FIJI-NEXT:    v_readlane_b32 s34, v40, 0
 ; FIJI-NEXT:    s_mov_b32 s32, s33
 ; FIJI-NEXT:    v_readlane_b32 s4, v40, 18
 ; FIJI-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -694,23 +694,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; HAWAII-NEXT:    s_mov_b64 exec, s[18:19]
 ; HAWAII-NEXT:    v_writelane_b32 v40, s16, 18
-; HAWAII-NEXT:    v_writelane_b32 v40, s30, 0
-; HAWAII-NEXT:    v_writelane_b32 v40, s31, 1
-; HAWAII-NEXT:    v_writelane_b32 v40, s34, 2
-; HAWAII-NEXT:    v_writelane_b32 v40, s35, 3
-; HAWAII-NEXT:    v_writelane_b32 v40, s36, 4
-; HAWAII-NEXT:    v_writelane_b32 v40, s37, 5
-; HAWAII-NEXT:    v_writelane_b32 v40, s38, 6
-; HAWAII-NEXT:    v_writelane_b32 v40, s39, 7
-; HAWAII-NEXT:    v_writelane_b32 v40, s48, 8
-; HAWAII-NEXT:    v_writelane_b32 v40, s49, 9
-; HAWAII-NEXT:    v_writelane_b32 v40, s50, 10
-; HAWAII-NEXT:    v_writelane_b32 v40, s51, 11
-; HAWAII-NEXT:    v_writelane_b32 v40, s52, 12
-; HAWAII-NEXT:    v_writelane_b32 v40, s53, 13
-; HAWAII-NEXT:    v_writelane_b32 v40, s54, 14
-; HAWAII-NEXT:    v_writelane_b32 v40, s55, 15
-; HAWAII-NEXT:    v_writelane_b32 v40, s64, 16
+; HAWAII-NEXT:    v_writelane_b32 v40, s34, 0
+; HAWAII-NEXT:    v_writelane_b32 v40, s35, 1
+; HAWAII-NEXT:    v_writelane_b32 v40, s36, 2
+; HAWAII-NEXT:    v_writelane_b32 v40, s37, 3
+; HAWAII-NEXT:    v_writelane_b32 v40, s38, 4
+; HAWAII-NEXT:    v_writelane_b32 v40, s39, 5
+; HAWAII-NEXT:    v_writelane_b32 v40, s48, 6
+; HAWAII-NEXT:    v_writelane_b32 v40, s49, 7
+; HAWAII-NEXT:    v_writelane_b32 v40, s50, 8
+; HAWAII-NEXT:    v_writelane_b32 v40, s51, 9
+; HAWAII-NEXT:    v_writelane_b32 v40, s52, 10
+; HAWAII-NEXT:    v_writelane_b32 v40, s53, 11
+; HAWAII-NEXT:    v_writelane_b32 v40, s54, 12
+; HAWAII-NEXT:    v_writelane_b32 v40, s55, 13
+; HAWAII-NEXT:    v_writelane_b32 v40, s64, 14
+; HAWAII-NEXT:    v_writelane_b32 v40, s65, 15
+; HAWAII-NEXT:    v_writelane_b32 v40, s30, 16
 ; HAWAII-NEXT:    s_mov_b32 s50, s15
 ; HAWAII-NEXT:    s_mov_b32 s51, s14
 ; HAWAII-NEXT:    s_mov_b32 s52, s13
@@ -722,7 +722,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; HAWAII-NEXT:    s_mov_b64 s[54:55], exec
 ; HAWAII-NEXT:    s_addk_i32 s32, 0x400
-; HAWAII-NEXT:    v_writelane_b32 v40, s65, 17
+; HAWAII-NEXT:    v_writelane_b32 v40, s31, 17
 ; HAWAII-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; HAWAII-NEXT:    v_readfirstlane_b32 s16, v0
 ; HAWAII-NEXT:    v_readfirstlane_b32 s17, v1
@@ -748,25 +748,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; HAWAII-NEXT:    s_cbranch_execnz .LBB18_1
 ; HAWAII-NEXT:  ; %bb.2:
 ; HAWAII-NEXT:    s_mov_b64 exec, s[54:55]
+; HAWAII-NEXT:    v_readlane_b32 s30, v40, 16
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, v4
-; HAWAII-NEXT:    v_readlane_b32 s65, v40, 17
-; HAWAII-NEXT:    v_readlane_b32 s64, v40, 16
-; HAWAII-NEXT:    v_readlane_b32 s55, v40, 15
-; HAWAII-NEXT:    v_readlane_b32 s54, v40, 14
-; HAWAII-NEXT:    v_readlane_b32 s53, v40, 13
-; HAWAII-NEXT:    v_readlane_b32 s52, v40, 12
-; HAWAII-NEXT:    v_readlane_b32 s51, v40, 11
-; HAWAII-NEXT:    v_readlane_b32 s50, v40, 10
-; HAWAII-NEXT:    v_readlane_b32 s49, v40, 9
-; HAWAII-NEXT:    v_readlane_b32 s48, v40, 8
-; HAWAII-NEXT:    v_readlane_b32 s39, v40, 7
-; HAWAII-NEXT:    v_readlane_b32 s38, v40, 6
-; HAWAII-NEXT:    v_readlane_b32 s37, v40, 5
-; HAWAII-NEXT:    v_readlane_b32 s36, v40, 4
-; HAWAII-NEXT:    v_readlane_b32 s35, v40, 3
-; HAWAII-NEXT:    v_readlane_b32 s34, v40, 2
-; HAWAII-NEXT:    v_readlane_b32 s31, v40, 1
-; HAWAII-NEXT:    v_readlane_b32 s30, v40, 0
+; HAWAII-NEXT:    v_readlane_b32 s31, v40, 17
+; HAWAII-NEXT:    v_readlane_b32 s65, v40, 15
+; HAWAII-NEXT:    v_readlane_b32 s64, v40, 14
+; HAWAII-NEXT:    v_readlane_b32 s55, v40, 13
+; HAWAII-NEXT:    v_readlane_b32 s54, v40, 12
+; HAWAII-NEXT:    v_readlane_b32 s53, v40, 11
+; HAWAII-NEXT:    v_readlane_b32 s52, v40, 10
+; HAWAII-NEXT:    v_readlane_b32 s51, v40, 9
+; HAWAII-NEXT:    v_readlane_b32 s50, v40, 8
+; HAWAII-NEXT:    v_readlane_b32 s49, v40, 7
+; HAWAII-NEXT:    v_readlane_b32 s48, v40, 6
+; HAWAII-NEXT:    v_readlane_b32 s39, v40, 5
+; HAWAII-NEXT:    v_readlane_b32 s38, v40, 4
+; HAWAII-NEXT:    v_readlane_b32 s37, v40, 3
+; HAWAII-NEXT:    v_readlane_b32 s36, v40, 2
+; HAWAII-NEXT:    v_readlane_b32 s35, v40, 1
+; HAWAII-NEXT:    v_readlane_b32 s34, v40, 0
 ; HAWAII-NEXT:    s_mov_b32 s32, s33
 ; HAWAII-NEXT:    v_readlane_b32 s4, v40, 18
 ; HAWAII-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -785,23 +785,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 18
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s52, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s53, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s54, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s55, 15
-; GFX9-NEXT:    v_writelane_b32 v40, s64, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s64, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s65, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX9-NEXT:    s_mov_b32 s50, s15
 ; GFX9-NEXT:    s_mov_b32 s51, s14
 ; GFX9-NEXT:    s_mov_b32 s52, s13
@@ -813,7 +813,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    s_mov_b64 s[54:55], exec
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s65, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX9-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s17, v1
@@ -839,25 +839,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX9-NEXT:  ; %bb.2:
 ; GFX9-NEXT:    s_mov_b64 exec, s[54:55]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_readlane_b32 s65, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s64, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s55, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s54, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s53, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s52, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s65, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s64, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 18
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 5ef54268c9372..540737672ed15 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -311,8 +311,8 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    s_mov_b32 s32, s34
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 3
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index a4aa8d55106e8..3537dae64ffea 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -9,7 +9,7 @@
 declare ptr addrspace(5) @llvm.stacksave.p5()
 declare void @llvm.stackrestore.p5(ptr addrspace(5))
 
-define hidden void @stack_passed_argument([32 x i32], i32) {
+define hidden void @stack_passed_argument([32 x i32], i32) #0 {
 ; WAVE32-OPT-LABEL: stack_passed_argument:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46,7 +46,7 @@ define hidden void @stack_passed_argument([32 x i32], i32) {
   ret void
 }
 
-define void @func_store_stacksave() {
+define void @func_store_stacksave() #0 {
 ; WAVE32-OPT-LABEL: func_store_stacksave:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,7 +99,7 @@ define void @func_store_stacksave() {
   ret void
 }
 
-define amdgpu_kernel void @kernel_store_stacksave() {
+define amdgpu_kernel void @kernel_store_stacksave() #0 {
 ; WAVE32-OPT-LABEL: kernel_store_stacksave:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_lshr_b32 s0, s32, 5
@@ -147,7 +147,7 @@ define amdgpu_kernel void @kernel_store_stacksave() {
   ret void
 }
 
-define amdgpu_kernel void @kernel_store_stacksave_nocall() {
+define amdgpu_kernel void @kernel_store_stacksave_nocall() #0 {
 ; WAVE32-OPT-LABEL: kernel_store_stacksave_nocall:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_getpc_b64 s[12:13]
@@ -228,7 +228,7 @@ define amdgpu_kernel void @kernel_store_stacksave_nocall() {
   ret void
 }
 
-define void @func_stacksave_nonentry_block(i1 %cond) {
+define void @func_stacksave_nonentry_block(i1 %cond) #0 {
 ; WAVE32-OPT-LABEL: func_stacksave_nonentry_block:
 ; WAVE32-OPT:       ; %bb.0: ; %bb0
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,7 +375,7 @@ bb2:
   ret void
 }
 
-define void @func_stackrestore_poison() {
+define void @func_stackrestore_poison() #0 {
 ; WAVE32-OPT-LABEL: func_stackrestore_poison:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -415,7 +415,7 @@ define void @func_stackrestore_poison() {
   ret void
 }
 
-define void @func_stackrestore_null() {
+define void @func_stackrestore_null() #0 {
 ; WAVE32-OPT-LABEL: func_stackrestore_null:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -455,7 +455,7 @@ define void @func_stackrestore_null() {
   ret void
 }
 
-define void @func_stackrestore_neg1() {
+define void @func_stackrestore_neg1() #0 {
 ; WAVE32-OPT-LABEL: func_stackrestore_neg1:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -495,7 +495,7 @@ define void @func_stackrestore_neg1() {
   ret void
 }
 
-define void @func_stackrestore_42() {
+define void @func_stackrestore_42() #0 {
 ; WAVE32-OPT-LABEL: func_stackrestore_42:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -535,7 +535,7 @@ define void @func_stackrestore_42() {
   ret void
 }
 
-define void @func_stacksave_stackrestore() {
+define void @func_stacksave_stackrestore() #0 {
 ; WAVE32-OPT-LABEL: func_stacksave_stackrestore:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -571,7 +571,7 @@ define void @func_stacksave_stackrestore() {
   ret void
 }
 
-define void @func_stacksave_stackrestore_use() {
+define void @func_stacksave_stackrestore_use() #0 {
 ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_use:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -628,7 +628,7 @@ define void @func_stacksave_stackrestore_use() {
   ret void
 }
 
-define amdgpu_kernel void @kernel_stacksave_stackrestore_use() {
+define amdgpu_kernel void @kernel_stacksave_stackrestore_use() #0 {
 ; WAVE32-OPT-LABEL: kernel_stacksave_stackrestore_use:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_lshr_b32 s0, s32, 5
@@ -680,7 +680,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_use() {
   ret void
 }
 
-define void @func_stacksave_stackrestore_voffset(i32 %offset) {
+define void @func_stacksave_stackrestore_voffset(i32 %offset) #0 {
 ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_voffset:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -737,7 +737,7 @@ define void @func_stacksave_stackrestore_voffset(i32 %offset) {
   ret void
 }
 
-define void @func_stacksave_vgpr(ptr addrspace(5) %stack) {
+define void @func_stacksave_vgpr(ptr addrspace(5) %stack) #0 {
 ; WAVE32-OPT-LABEL: func_stacksave_vgpr:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -779,7 +779,7 @@ define void @func_stacksave_vgpr(ptr addrspace(5) %stack) {
   ret void
 }
 
-define amdgpu_gfx void @func_stacksave_sgpr(ptr addrspace(5) inreg %stack) {
+define amdgpu_gfx void @func_stacksave_sgpr(ptr addrspace(5) inreg %stack) #0 {
 ; WAVE32-OPT-LABEL: func_stacksave_sgpr:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -816,7 +816,7 @@ define amdgpu_gfx void @func_stacksave_sgpr(ptr addrspace(5) inreg %stack) {
   ret void
 }
 
-define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) {
+define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) #0 {
 ; WAVE32-OPT-LABEL: kernel_stacksave_sgpr:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_load_dword s0, s[4:5], 0x0
@@ -881,7 +881,7 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) {
   ret void
 }
 
-define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects() {
+define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects() #0 {
 ; WAVE32-OPT-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_getpc_b64 s[20:21]
@@ -1304,7 +1304,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
   ret void
 }
 
-define void @func_stacksave_stackrestore_call_with_stack_objects() {
+define void @func_stacksave_stackrestore_call_with_stack_objects() #0 {
 ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
 ; WAVE32-OPT:       ; %bb.0:
 ; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1327,11 +1327,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-OPT-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; WAVE32-OPT-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; WAVE32-OPT-NEXT:    s_mov_b32 s32, s18
+; WAVE32-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE32-OPT-NEXT:    ;;#ASMSTART
 ; WAVE32-OPT-NEXT:    ; use s19
 ; WAVE32-OPT-NEXT:    ;;#ASMEND
 ; WAVE32-OPT-NEXT:    v_readlane_b32 s31, v32, 1
-; WAVE32-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE32-OPT-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-OPT-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-OPT-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1362,11 +1362,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE64-OPT-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; WAVE64-OPT-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; WAVE64-OPT-NEXT:    s_mov_b32 s32, s18
+; WAVE64-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE64-OPT-NEXT:    ;;#ASMSTART
 ; WAVE64-OPT-NEXT:    ; use s19
 ; WAVE64-OPT-NEXT:    ;;#ASMEND
 ; WAVE64-OPT-NEXT:    v_readlane_b32 s31, v32, 1
-; WAVE64-OPT-NEXT:    v_readlane_b32 s30, v32, 0
 ; WAVE64-OPT-NEXT:    s_mov_b32 s32, s33
 ; WAVE64-OPT-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-OPT-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1478,8 +1478,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-O0-NEXT:    ; use s5
 ; WAVE32-O0-NEXT:    ;;#ASMEND
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
-; WAVE32-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE32-O0-NEXT:    v_readlane_b32 s30, v32, 0
+; WAVE32-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-O0-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1592,8 +1592,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE64-O0-NEXT:    ; use s5
 ; WAVE64-O0-NEXT:    ;;#ASMEND
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
-; WAVE64-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE64-O0-NEXT:    v_readlane_b32 s30, v32, 0
+; WAVE64-O0-NEXT:    v_readlane_b32 s31, v32, 1
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s33
 ; WAVE64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1706,8 +1706,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-WWM-PREALLOC-NEXT:    ; use s5
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMEND
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
-; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s31, v33, 1
 ; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s30, v33, 0
+; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s31, v33, 1
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s33
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1725,6 +1725,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
   ret void
 }
 
+attributes #0 = { nounwind }
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
index 034119b98790f..05ea168c9ec7c 100644
--- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -175,8 +175,8 @@ define void @outgoing_f16_arg(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -207,8 +207,8 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -245,8 +245,8 @@ define void @outgoing_f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_short v[40:41], v0
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -283,8 +283,8 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_dword v[40:41], v0
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -324,8 +324,8 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_dword v[40:41], v0
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -371,8 +371,8 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    flat_store_dword v[40:41], v0
 ; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -402,8 +402,8 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX7-NEXT:    s_mov_b32 s32, s33
 ; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
index 5c6fcd4f977e3..13cde61ff16a0 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
@@ -18,11 +18,12 @@ define void @test_load_zext() #0 {
 ; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    s_mov_b32 s0, DescriptorBuffer@abs32@lo
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s0, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll
index 7ce14c919e865..8cd83979d26c0 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.waterfall.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
 
-define hidden void @void_func_i32_inreg(i32 inreg) {
+define hidden void @void_func_i32_inreg(i32 inreg) #0 {
 ; CHECK-LABEL: void_func_i32_inreg:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9,7 +9,7 @@ define hidden void @void_func_i32_inreg(i32 inreg) {
   ret void
 }
 
-define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
+define void @tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
 ; CHECK-LABEL: tail_call_i32_inreg_divergent:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36,8 +36,8 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
 ; CHECK-NEXT:  ; %bb.2:
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -51,7 +51,7 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
 
 @constant = external hidden addrspace(4) constant ptr
 
-define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
+define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) #0 {
 ; CHECK-LABEL: indirect_tail_call_i32_inreg_divergent:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -61,31 +61,31 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v40, s16, 20
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v40, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s49, 9
-; CHECK-NEXT:    v_writelane_b32 v40, s50, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s51, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s52, 12
-; CHECK-NEXT:    v_writelane_b32 v40, s53, 13
-; CHECK-NEXT:    v_writelane_b32 v40, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v40, s34, 0
+; CHECK-NEXT:    v_writelane_b32 v40, s35, 1
+; CHECK-NEXT:    v_writelane_b32 v40, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v40, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v40, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v40, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v40, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v40, s49, 7
+; CHECK-NEXT:    v_writelane_b32 v40, s50, 8
+; CHECK-NEXT:    v_writelane_b32 v40, s51, 9
+; CHECK-NEXT:    v_writelane_b32 v40, s52, 10
+; CHECK-NEXT:    v_writelane_b32 v40, s53, 11
+; CHECK-NEXT:    v_writelane_b32 v40, s54, 12
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v40, s55, 15
-; CHECK-NEXT:    v_writelane_b32 v40, s64, 16
+; CHECK-NEXT:    v_writelane_b32 v40, s55, 13
+; CHECK-NEXT:    v_writelane_b32 v40, s64, 14
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, constant@rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, constant@rel32@hi+12
-; CHECK-NEXT:    v_writelane_b32 v40, s65, 17
+; CHECK-NEXT:    v_writelane_b32 v40, s65, 15
 ; CHECK-NEXT:    s_load_dwordx2 s[64:65], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s66, 18
+; CHECK-NEXT:    v_writelane_b32 v40, s66, 16
+; CHECK-NEXT:    v_writelane_b32 v40, s67, 17
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 18
 ; CHECK-NEXT:    s_mov_b32 s50, s15
 ; CHECK-NEXT:    s_mov_b32 s51, s14
 ; CHECK-NEXT:    s_mov_b32 s52, s13
@@ -94,7 +94,7 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; CHECK-NEXT:    s_mov_b64 s[54:55], exec
-; CHECK-NEXT:    v_writelane_b32 v40, s67, 19
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 19
 ; CHECK-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_readfirstlane_b32 s16, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s16, v0
@@ -115,26 +115,26 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
 ; CHECK-NEXT:  ; %bb.2:
 ; CHECK-NEXT:    s_mov_b64 exec, s[54:55]
-; CHECK-NEXT:    v_readlane_b32 s67, v40, 19
-; CHECK-NEXT:    v_readlane_b32 s66, v40, 18
-; CHECK-NEXT:    v_readlane_b32 s65, v40, 17
-; CHECK-NEXT:    v_readlane_b32 s64, v40, 16
-; CHECK-NEXT:    v_readlane_b32 s55, v40, 15
-; CHECK-NEXT:    v_readlane_b32 s54, v40, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 18
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 19
+; CHECK-NEXT:    v_readlane_b32 s67, v40, 17
+; CHECK-NEXT:    v_readlane_b32 s66, v40, 16
+; CHECK-NEXT:    v_readlane_b32 s65, v40, 15
+; CHECK-NEXT:    v_readlane_b32 s64, v40, 14
+; CHECK-NEXT:    v_readlane_b32 s55, v40, 13
+; CHECK-NEXT:    v_readlane_b32 s54, v40, 12
+; CHECK-NEXT:    v_readlane_b32 s53, v40, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v40, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v40, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v40, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v40, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v40, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v40, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v40, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v40, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v40, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v40, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 20
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index e78d62561238b..e5215fe1acdef 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -646,29 +646,30 @@ define i32 @s_in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
 ; GCN-NEXT:    s_or_saveexec_b32 s16, -1
 ; GCN-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b32 exec_lo, s16
-; GCN-NEXT:    v_writelane_b32 v40, s2, 4
 ; GCN-NEXT:    s_add_i32 s32, s32, 16
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, use32@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    v_writelane_b32 v40, s2, 4
 ; GCN-NEXT:    s_load_b64 s[16:17], s[16:17], 0x0
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    s_mov_b32 s34, s1
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    s_and_b32 s35, s0, s3
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s35
+; GCN-NEXT:    v_writelane_b32 v40, s30, 2
+; GCN-NEXT:    v_writelane_b32 v40, s31, 3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_xor_b32 s0, s35, s34
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_readlane_b32 s30, v40, 2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 3
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s0, v40, 4
 ; GCN-NEXT:    s_or_saveexec_b32 s1, -1
@@ -702,20 +703,21 @@ define i32 @s_in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
 ; GCN-NEXT:    s_xor_b32 s0, s0, s1
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    s_mov_b32 s34, s1
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    s_and_b32 s35, s0, s3
+; GCN-NEXT:    v_writelane_b32 v40, s30, 2
+; GCN-NEXT:    v_writelane_b32 v40, s31, 3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    s_xor_b32 s0, s35, s34
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_readlane_b32 s30, v40, 2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s31, v40, 3
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s0, v40, 4
 ; GCN-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index ef110379efc2e..781de08ea4496 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amdhsa -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
 ; RUN: opt -S -si-annotate-control-flow -mtriple=amdgcn-amdhsa -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI-OPT %s
 
-define hidden void @widget() {
+define hidden void @widget() #0 {
 ; GCN-LABEL: widget:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14,22 +14,22 @@ define hidden void @widget() {
 ; GCN-NEXT:    v_writelane_b32 v41, s16, 16
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v41, s30, 0
-; GCN-NEXT:    v_writelane_b32 v41, s31, 1
-; GCN-NEXT:    v_writelane_b32 v41, s34, 2
-; GCN-NEXT:    v_writelane_b32 v41, s35, 3
-; GCN-NEXT:    v_writelane_b32 v41, s36, 4
-; GCN-NEXT:    v_writelane_b32 v41, s37, 5
-; GCN-NEXT:    v_writelane_b32 v41, s38, 6
-; GCN-NEXT:    v_writelane_b32 v41, s39, 7
-; GCN-NEXT:    v_writelane_b32 v41, s48, 8
-; GCN-NEXT:    v_writelane_b32 v41, s49, 9
-; GCN-NEXT:    v_writelane_b32 v41, s50, 10
-; GCN-NEXT:    v_writelane_b32 v41, s51, 11
-; GCN-NEXT:    v_writelane_b32 v41, s52, 12
-; GCN-NEXT:    v_writelane_b32 v41, s53, 13
-; GCN-NEXT:    v_writelane_b32 v41, s54, 14
-; GCN-NEXT:    v_writelane_b32 v41, s55, 15
+; GCN-NEXT:    v_writelane_b32 v41, s34, 0
+; GCN-NEXT:    v_writelane_b32 v41, s35, 1
+; GCN-NEXT:    v_writelane_b32 v41, s36, 2
+; GCN-NEXT:    v_writelane_b32 v41, s37, 3
+; GCN-NEXT:    v_writelane_b32 v41, s38, 4
+; GCN-NEXT:    v_writelane_b32 v41, s39, 5
+; GCN-NEXT:    v_writelane_b32 v41, s48, 6
+; GCN-NEXT:    v_writelane_b32 v41, s49, 7
+; GCN-NEXT:    v_writelane_b32 v41, s50, 8
+; GCN-NEXT:    v_writelane_b32 v41, s51, 9
+; GCN-NEXT:    v_writelane_b32 v41, s52, 10
+; GCN-NEXT:    v_writelane_b32 v41, s53, 11
+; GCN-NEXT:    v_writelane_b32 v41, s54, 12
+; GCN-NEXT:    v_writelane_b32 v41, s55, 13
+; GCN-NEXT:    v_writelane_b32 v41, s30, 14
+; GCN-NEXT:    v_writelane_b32 v41, s31, 15
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
@@ -93,22 +93,22 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_addc_u32 s17, s17, wibble@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:  .LBB0_8: ; %UnifiedReturnBlock
-; GCN-NEXT:    v_readlane_b32 s55, v41, 15
-; GCN-NEXT:    v_readlane_b32 s54, v41, 14
-; GCN-NEXT:    v_readlane_b32 s53, v41, 13
-; GCN-NEXT:    v_readlane_b32 s52, v41, 12
-; GCN-NEXT:    v_readlane_b32 s51, v41, 11
-; GCN-NEXT:    v_readlane_b32 s50, v41, 10
-; GCN-NEXT:    v_readlane_b32 s49, v41, 9
-; GCN-NEXT:    v_readlane_b32 s48, v41, 8
-; GCN-NEXT:    v_readlane_b32 s39, v41, 7
-; GCN-NEXT:    v_readlane_b32 s38, v41, 6
-; GCN-NEXT:    v_readlane_b32 s37, v41, 5
-; GCN-NEXT:    v_readlane_b32 s36, v41, 4
-; GCN-NEXT:    v_readlane_b32 s35, v41, 3
-; GCN-NEXT:    v_readlane_b32 s34, v41, 2
-; GCN-NEXT:    v_readlane_b32 s31, v41, 1
-; GCN-NEXT:    v_readlane_b32 s30, v41, 0
+; GCN-NEXT:    v_readlane_b32 s30, v41, 14
+; GCN-NEXT:    v_readlane_b32 s31, v41, 15
+; GCN-NEXT:    v_readlane_b32 s55, v41, 13
+; GCN-NEXT:    v_readlane_b32 s54, v41, 12
+; GCN-NEXT:    v_readlane_b32 s53, v41, 11
+; GCN-NEXT:    v_readlane_b32 s52, v41, 10
+; GCN-NEXT:    v_readlane_b32 s51, v41, 9
+; GCN-NEXT:    v_readlane_b32 s50, v41, 8
+; GCN-NEXT:    v_readlane_b32 s49, v41, 7
+; GCN-NEXT:    v_readlane_b32 s48, v41, 6
+; GCN-NEXT:    v_readlane_b32 s39, v41, 5
+; GCN-NEXT:    v_readlane_b32 s38, v41, 4
+; GCN-NEXT:    v_readlane_b32 s37, v41, 3
+; GCN-NEXT:    v_readlane_b32 s36, v41, 2
+; GCN-NEXT:    v_readlane_b32 s35, v41, 1
+; GCN-NEXT:    v_readlane_b32 s34, v41, 0
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v41, 16
@@ -191,7 +191,7 @@ bb12:                                             ; preds = %bb9, %bb2
 declare hidden float @wibble() local_unnamed_addr
 
 
-define hidden void @blam() {
+define hidden void @blam() #0 {
 ; SI-OPT-LABEL: @blam(
 ; SI-OPT-NEXT:  bb:
 ; SI-OPT-NEXT:    [[TMP:%.*]] = load float, ptr null, align 16
@@ -266,32 +266,32 @@ define hidden void @blam() {
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v45, s30, 0
-; GCN-NEXT:    v_writelane_b32 v45, s31, 1
-; GCN-NEXT:    v_writelane_b32 v45, s34, 2
-; GCN-NEXT:    v_writelane_b32 v45, s35, 3
-; GCN-NEXT:    v_writelane_b32 v45, s36, 4
-; GCN-NEXT:    v_writelane_b32 v45, s37, 5
-; GCN-NEXT:    v_writelane_b32 v45, s38, 6
-; GCN-NEXT:    v_writelane_b32 v45, s39, 7
-; GCN-NEXT:    v_writelane_b32 v45, s48, 8
-; GCN-NEXT:    v_writelane_b32 v45, s49, 9
-; GCN-NEXT:    v_writelane_b32 v45, s50, 10
-; GCN-NEXT:    v_writelane_b32 v45, s51, 11
-; GCN-NEXT:    v_writelane_b32 v45, s52, 12
-; GCN-NEXT:    v_writelane_b32 v45, s53, 13
-; GCN-NEXT:    v_writelane_b32 v45, s54, 14
-; GCN-NEXT:    v_writelane_b32 v45, s55, 15
-; GCN-NEXT:    v_writelane_b32 v45, s64, 16
-; GCN-NEXT:    v_writelane_b32 v45, s65, 17
-; GCN-NEXT:    v_writelane_b32 v45, s66, 18
-; GCN-NEXT:    v_writelane_b32 v45, s67, 19
-; GCN-NEXT:    v_writelane_b32 v45, s68, 20
-; GCN-NEXT:    v_writelane_b32 v45, s69, 21
-; GCN-NEXT:    v_writelane_b32 v45, s70, 22
-; GCN-NEXT:    v_writelane_b32 v45, s71, 23
-; GCN-NEXT:    v_writelane_b32 v45, s80, 24
-; GCN-NEXT:    v_writelane_b32 v45, s81, 25
+; GCN-NEXT:    v_writelane_b32 v45, s34, 0
+; GCN-NEXT:    v_writelane_b32 v45, s35, 1
+; GCN-NEXT:    v_writelane_b32 v45, s36, 2
+; GCN-NEXT:    v_writelane_b32 v45, s37, 3
+; GCN-NEXT:    v_writelane_b32 v45, s38, 4
+; GCN-NEXT:    v_writelane_b32 v45, s39, 5
+; GCN-NEXT:    v_writelane_b32 v45, s48, 6
+; GCN-NEXT:    v_writelane_b32 v45, s49, 7
+; GCN-NEXT:    v_writelane_b32 v45, s50, 8
+; GCN-NEXT:    v_writelane_b32 v45, s51, 9
+; GCN-NEXT:    v_writelane_b32 v45, s52, 10
+; GCN-NEXT:    v_writelane_b32 v45, s53, 11
+; GCN-NEXT:    v_writelane_b32 v45, s54, 12
+; GCN-NEXT:    v_writelane_b32 v45, s55, 13
+; GCN-NEXT:    v_writelane_b32 v45, s64, 14
+; GCN-NEXT:    v_writelane_b32 v45, s65, 15
+; GCN-NEXT:    v_writelane_b32 v45, s66, 16
+; GCN-NEXT:    v_writelane_b32 v45, s67, 17
+; GCN-NEXT:    v_writelane_b32 v45, s68, 18
+; GCN-NEXT:    v_writelane_b32 v45, s69, 19
+; GCN-NEXT:    v_writelane_b32 v45, s70, 20
+; GCN-NEXT:    v_writelane_b32 v45, s71, 21
+; GCN-NEXT:    v_writelane_b32 v45, s80, 22
+; GCN-NEXT:    v_writelane_b32 v45, s81, 23
+; GCN-NEXT:    v_writelane_b32 v45, s30, 24
+; GCN-NEXT:    v_writelane_b32 v45, s31, 25
 ; GCN-NEXT:    v_mov_b32_e32 v40, v31
 ; GCN-NEXT:    s_mov_b32 s54, s15
 ; GCN-NEXT:    s_mov_b32 s55, s14
@@ -427,32 +427,32 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_branch .LBB1_1
 ; GCN-NEXT:  .LBB1_18: ; %DummyReturnBlock
 ; GCN-NEXT:    s_or_b64 exec, exec, s[66:67]
-; GCN-NEXT:    v_readlane_b32 s81, v45, 25
-; GCN-NEXT:    v_readlane_b32 s80, v45, 24
-; GCN-NEXT:    v_readlane_b32 s71, v45, 23
-; GCN-NEXT:    v_readlane_b32 s70, v45, 22
-; GCN-NEXT:    v_readlane_b32 s69, v45, 21
-; GCN-NEXT:    v_readlane_b32 s68, v45, 20
-; GCN-NEXT:    v_readlane_b32 s67, v45, 19
-; GCN-NEXT:    v_readlane_b32 s66, v45, 18
-; GCN-NEXT:    v_readlane_b32 s65, v45, 17
-; GCN-NEXT:    v_readlane_b32 s64, v45, 16
-; GCN-NEXT:    v_readlane_b32 s55, v45, 15
-; GCN-NEXT:    v_readlane_b32 s54, v45, 14
-; GCN-NEXT:    v_readlane_b32 s53, v45, 13
-; GCN-NEXT:    v_readlane_b32 s52, v45, 12
-; GCN-NEXT:    v_readlane_b32 s51, v45, 11
-; GCN-NEXT:    v_readlane_b32 s50, v45, 10
-; GCN-NEXT:    v_readlane_b32 s49, v45, 9
-; GCN-NEXT:    v_readlane_b32 s48, v45, 8
-; GCN-NEXT:    v_readlane_b32 s39, v45, 7
-; GCN-NEXT:    v_readlane_b32 s38, v45, 6
-; GCN-NEXT:    v_readlane_b32 s37, v45, 5
-; GCN-NEXT:    v_readlane_b32 s36, v45, 4
-; GCN-NEXT:    v_readlane_b32 s35, v45, 3
-; GCN-NEXT:    v_readlane_b32 s34, v45, 2
-; GCN-NEXT:    v_readlane_b32 s31, v45, 1
-; GCN-NEXT:    v_readlane_b32 s30, v45, 0
+; GCN-NEXT:    v_readlane_b32 s30, v45, 24
+; GCN-NEXT:    v_readlane_b32 s31, v45, 25
+; GCN-NEXT:    v_readlane_b32 s81, v45, 23
+; GCN-NEXT:    v_readlane_b32 s80, v45, 22
+; GCN-NEXT:    v_readlane_b32 s71, v45, 21
+; GCN-NEXT:    v_readlane_b32 s70, v45, 20
+; GCN-NEXT:    v_readlane_b32 s69, v45, 19
+; GCN-NEXT:    v_readlane_b32 s68, v45, 18
+; GCN-NEXT:    v_readlane_b32 s67, v45, 17
+; GCN-NEXT:    v_readlane_b32 s66, v45, 16
+; GCN-NEXT:    v_readlane_b32 s65, v45, 15
+; GCN-NEXT:    v_readlane_b32 s64, v45, 14
+; GCN-NEXT:    v_readlane_b32 s55, v45, 13
+; GCN-NEXT:    v_readlane_b32 s54, v45, 12
+; GCN-NEXT:    v_readlane_b32 s53, v45, 11
+; GCN-NEXT:    v_readlane_b32 s52, v45, 10
+; GCN-NEXT:    v_readlane_b32 s51, v45, 9
+; GCN-NEXT:    v_readlane_b32 s50, v45, 8
+; GCN-NEXT:    v_readlane_b32 s49, v45, 7
+; GCN-NEXT:    v_readlane_b32 s48, v45, 6
+; GCN-NEXT:    v_readlane_b32 s39, v45, 5
+; GCN-NEXT:    v_readlane_b32 s38, v45, 4
+; GCN-NEXT:    v_readlane_b32 s37, v45, 3
+; GCN-NEXT:    v_readlane_b32 s36, v45, 2
+; GCN-NEXT:    v_readlane_b32 s35, v45, 1
+; GCN-NEXT:    v_readlane_b32 s34, v45, 0
 ; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -522,3 +522,4 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 
 declare hidden float @spam()
 
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index fbacb53f2858e..573cdedb523d5 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=CHECK %s
 
-define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
+define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" nounwind {
 ; CHECK-LABEL: max_6_vgprs:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    global_load_b32 v2, v[0:1], off scope:SCOPE_SYS
@@ -68,7 +68,7 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
   ret void
 }
 
-define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgpu-num-vgpr"="11" {
+define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgpu-num-vgpr"="11" nounwind {
 ; CHECK-LABEL: max_11_vgprs_branch:
 ; CHECK:       ; %bb.0: ; %.entry
 ; CHECK-NEXT:    global_load_b32 v3, v[0:1], off scope:SCOPE_SYS
@@ -390,7 +390,7 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    global_load_b128 v[18:21], v[8:9], off offset:80
 ; CHECK-NEXT:    global_load_b128 v[22:25], v[8:9], off offset:96
 ; CHECK-NEXT:    global_load_b128 v[26:29], v[8:9], off offset:112
-; CHECK-NEXT:    v_writelane_b32 v93, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v93, s34, 0
 ; CHECK-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; CHECK-NEXT:    s_wait_loadcnt 0x4
 ; CHECK-NEXT:    scratch_store_b128 off, v[10:13], s33 offset:276 ; 16-byte Folded Spill
@@ -406,27 +406,27 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    scratch_store_b128 off, v[30:33], s33 offset:356
 ; CHECK-NEXT:    scratch_store_b128 off, v[34:37], s33 offset:372
 ; CHECK-NEXT:    scratch_store_b128 off, v[38:41], s33 offset:388
-; CHECK-NEXT:    v_writelane_b32 v93, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v93, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v93, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v93, s35, 1
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    v_writelane_b32 v93, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v93, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v93, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v93, s37, 3
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    v_writelane_b32 v93, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v93, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v93, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v93, s39, 5
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_writelane_b32 v93, s48, 8
-; CHECK-NEXT:    v_writelane_b32 v93, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v93, s48, 6
+; CHECK-NEXT:    v_writelane_b32 v93, s49, 7
 ; CHECK-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v93, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v93, s50, 8
 ; CHECK-NEXT:    s_mov_b32 s50, s15
-; CHECK-NEXT:    v_writelane_b32 v93, s51, 11
+; CHECK-NEXT:    v_writelane_b32 v93, s51, 9
 ; CHECK-NEXT:    s_mov_b32 s51, s14
-; CHECK-NEXT:    v_writelane_b32 v93, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v93, s52, 10
 ; CHECK-NEXT:    s_mov_b32 s52, s13
-; CHECK-NEXT:    v_writelane_b32 v93, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v93, s53, 11
 ; CHECK-NEXT:    s_mov_b32 s53, s12
+; CHECK-NEXT:    v_writelane_b32 v93, s30, 12
+; CHECK-NEXT:    v_writelane_b32 v93, s31, 13
 ; CHECK-NEXT:    s_wait_kmcnt 0x0
 ; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -551,20 +551,20 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    scratch_load_b32 v42, off, s33 offset:136
 ; CHECK-NEXT:    scratch_load_b32 v41, off, s33 offset:140
 ; CHECK-NEXT:    scratch_load_b32 v40, off, s33 offset:144
-; CHECK-NEXT:    v_readlane_b32 s53, v93, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v93, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v93, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v93, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v93, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v93, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v93, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v93, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v93, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v93, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v93, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v93, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v93, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v93, 0
+; CHECK-NEXT:    v_readlane_b32 s30, v93, 12
+; CHECK-NEXT:    v_readlane_b32 s31, v93, 13
+; CHECK-NEXT:    v_readlane_b32 s53, v93, 11
+; CHECK-NEXT:    v_readlane_b32 s52, v93, 10
+; CHECK-NEXT:    v_readlane_b32 s51, v93, 9
+; CHECK-NEXT:    v_readlane_b32 s50, v93, 8
+; CHECK-NEXT:    v_readlane_b32 s49, v93, 7
+; CHECK-NEXT:    v_readlane_b32 s48, v93, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v93, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v93, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v93, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v93, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v93, 1
+; CHECK-NEXT:    v_readlane_b32 s34, v93, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s0, v93, 14
 ; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index ff1475758382f..580ef1522ee14 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -52,8 +52,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v44, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -109,8 +109,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s4, v44, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
@@ -163,8 +163,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
@@ -236,8 +236,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX9-NEXT:    s_mov_b32 s32, s33
 ; GFX9-NEXT:    v_readlane_b32 s4, v45, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -286,8 +286,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16
-; GFX10-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s4, v45, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
@@ -335,8 +335,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16
-; GFX11-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v45, 1
 ; GFX11-NEXT:    s_mov_b32 s32, s33
 ; GFX11-NEXT:    v_readlane_b32 s0, v45, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
index 93d864246d68d..0a2f4af37551d 100644
--- a/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
@@ -4,7 +4,7 @@
 ; Make sure SIFixSGPRCopies handles situations where it needs to fix
 ; up copies to physical registers from an AV virtual register.
 
-define i32 @fix_sgpr_copies_indirect_call(ptr addrspace(5) %ptr) {
+define i32 @fix_sgpr_copies_indirect_call(ptr addrspace(5) %ptr) #0 {
 ; CHECK-LABEL: fix_sgpr_copies_indirect_call:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -118,8 +118,8 @@ define i32 @fix_sgpr_copies_indirect_call(ptr addrspace(5) %ptr) {
 ; CHECK-NEXT:    v_readlane_b32 s5, v41, 13
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 4
 ; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
@@ -139,3 +139,5 @@ bb1:                                              ; preds = %bb
   tail call void %i()
   ret i32 0
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 65ae8bc6f0e58..2e29d7f215686 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -3085,8 +3085,8 @@ define void @callee_no_stack_with_call() #1 {
 ; GFX1032-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1032-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX1032-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1032-NEXT:    s_mov_b32 s32, s33
 ; GFX1032-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX1032-NEXT:    s_or_saveexec_b32 s5, -1
@@ -3116,8 +3116,8 @@ define void @callee_no_stack_with_call() #1 {
 ; GFX1064-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1064-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX1064-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX1064-NEXT:    s_mov_b32 s32, s33
 ; GFX1064-NEXT:    v_readlane_b32 s4, v40, 2
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index 352d9258c2c93..fa9b09ea73c93 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -1593,8 +1593,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; DAGISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -1929,8 +1929,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; GISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; GISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -2266,8 +2266,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; DAGISEL64-NEXT:    v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; DAGISEL64-NEXT:    v_readlane_b32 s5, v40, 1
 ; DAGISEL64-NEXT:    v_readlane_b32 s4, v40, 0
 ; DAGISEL64-NEXT:    v_readlane_b32 s0, v40, 4
@@ -2604,8 +2604,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; GISEL64-NEXT:    v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT:    v_readlane_b32 s31, v40, 3
 ; GISEL64-NEXT:    v_readlane_b32 s5, v40, 1
 ; GISEL64-NEXT:    v_readlane_b32 s4, v40, 0
 ; GISEL64-NEXT:    v_readlane_b32 s0, v40, 4
@@ -3719,8 +3719,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX1250-DAGISEL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
 ; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s0, v40, 3
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 nv ; 4-byte Folded Reload
@@ -8048,9 +8048,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    v_writelane_b32 v42, s31, 2
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; DAGISEL-NEXT:    flat_store_b32 v[40:41], v0
 ; DAGISEL-NEXT:    v_readlane_b32 s31, v42, 2
-; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
@@ -8389,9 +8390,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    v_writelane_b32 v42, s31, 2
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GISEL-NEXT:    flat_store_b32 v[40:41], v0
 ; GISEL-NEXT:    v_readlane_b32 s31, v42, 2
-; GISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; GISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
@@ -8732,9 +8734,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    v_writelane_b32 v42, s31, 3
 ; DAGISEL64-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT:    v_readlane_b32 s30, v42, 2
 ; DAGISEL64-NEXT:    flat_store_b32 v[40:41], v0
 ; DAGISEL64-NEXT:    v_readlane_b32 s31, v42, 3
-; DAGISEL64-NEXT:    v_readlane_b32 s30, v42, 2
 ; DAGISEL64-NEXT:    v_readlane_b32 s5, v42, 1
 ; DAGISEL64-NEXT:    v_readlane_b32 s4, v42, 0
 ; DAGISEL64-NEXT:    v_readlane_b32 s0, v42, 4
@@ -9076,9 +9079,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    v_writelane_b32 v42, s31, 3
 ; GISEL64-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT:    v_readlane_b32 s30, v42, 2
 ; GISEL64-NEXT:    flat_store_b32 v[40:41], v0
 ; GISEL64-NEXT:    v_readlane_b32 s31, v42, 3
-; GISEL64-NEXT:    v_readlane_b32 s30, v42, 2
 ; GISEL64-NEXT:    v_readlane_b32 s5, v42, 1
 ; GISEL64-NEXT:    v_readlane_b32 s4, v42, 0
 ; GISEL64-NEXT:    v_readlane_b32 s0, v42, 4
@@ -10197,9 +10201,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v42, s30, 1
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v42, s31, 2
 ; GFX1250-DAGISEL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GFX1250-DAGISEL-NEXT:    flat_store_b32 v[40:41], v0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s31, v42, 2
-; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s0, v42, 3
 ; GFX1250-DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index 06c451869e841..4011d0990d5ab 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -41,12 +41,12 @@ define void @vector_reg_liverange_split() #0 {
 ; GFX90A-NEXT:    s_or_saveexec_b64 s[28:29], -1
 ; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a32
 ; GFX90A-NEXT:    s_mov_b64 exec, s[28:29]
+; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX90A-NEXT:    v_readlane_b32 s20, v39, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s20
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX90A-NEXT:    s_mov_b32 s32, s33
 ; GFX90A-NEXT:    v_readlane_b32 s4, v40, 4
 ; GFX90A-NEXT:    v_readlane_b32 s28, v40, 2
@@ -68,4 +68,4 @@ define void @vector_reg_liverange_split() #0 {
 
 declare void @foo()
 
-attributes #0 = { "amdgpu-num-vgpr"="41" "amdgpu-num-sgpr"="34"}
+attributes #0 = { "amdgpu-num-vgpr"="41" "amdgpu-num-sgpr"="34" nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index ff33cca0702ae..5009f0249df6d 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -49,10 +49,10 @@ define void @test() #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s4, v39, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-NEXT:    v_readlane_b32 s28, v40, 2
@@ -111,8 +111,8 @@ define void @test() #0 {
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-O0-NEXT:    global_store_dword v[0:1], v2, off
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-O0-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-O0-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-O0-NEXT:    s_mov_b32 s32, s33
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-O0-NEXT:    v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 6aacbce89c986..3c8ac9f58b63e 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX9-O0 %s
 ; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX9-O3 %s
 
-define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
+define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) #1 {
 ; GFX9-O0-LABEL: strict_wwm_no_cfg:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -129,7 +129,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
   ret void
 }
 
-define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
+define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) #1 {
 ; GFX9-O0-LABEL: strict_wwm_cfg:
 ; GFX9-O0:       ; %bb.0: ; %entry
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -339,7 +339,7 @@ define hidden amdgpu_gfx i32 @strict_wwm_called(i32 %a) noinline {
   ret i32 %sub
 }
 
-define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
+define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) #1 {
 ; GFX9-O0-LABEL: strict_wwm_call:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -387,8 +387,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -424,9 +424,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -521,7 +521,7 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
   ret i64 %sub
 }
 
-define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
+define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) #1 {
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -621,8 +621,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -682,9 +682,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s32, s33
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -707,7 +707,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
   ret void
 }
 
-define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
+define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) #1 {
 ; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1244,3 +1244,4 @@ declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
 
 attributes #0 = { "amdgpu-waves-per-eu"="5,5" }
+attributes #1 = { nounwind }



More information about the llvm-branch-commits mailing list