[llvm-branch-commits] [llvm] [amdgpu-cfi: 6/9]: [AMDGPU] Use register pair for PC spill (PR #183146)
Scott Linder via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Feb 26 13:53:19 PST 2026
https://github.com/slinder1 updated https://github.com/llvm/llvm-project/pull/183146
From a4fdcf14ced1d0bfa44d198181cddc58ccbed726 Mon Sep 17 00:00:00 2001
From: Scott Linder <Scott.Linder at amd.com>
Date: Wed, 29 Oct 2025 18:46:12 +0000
Subject: [PATCH] [AMDGPU] Use register pair for PC spill
Change-Id: Ibedeef926f7ff235a06de65a83087c151f66a416
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 21 +
.../CodeGen/AMDGPU/GlobalISel/assert-align.ll | 2 +-
.../GlobalISel/call-outgoing-stack-args.ll | 8 +-
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 +-
.../abi-attribute-hints-undefined-behavior.ll | 2 +-
.../CodeGen/AMDGPU/amdgcn-call-whole-wave.ll | 8 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 5124 ++++++++---------
.../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 226 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll | 51 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll | 26 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll | 123 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll | 51 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 226 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll | 51 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 266 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 170 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll | 51 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll | 123 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll | 122 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll | 26 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 2550 ++++----
.../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 186 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 218 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 226 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 242 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 330 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 410 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 554 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 730 +--
.../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 122 +-
.../test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll | 4 +-
.../amdgpu-simplify-libcall-pow-codegen.ll | 285 +-
...tor-flatscratchinit-undefined-behavior2.ll | 13 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 998 +---
.../test/CodeGen/AMDGPU/branch-relax-spill.ll | 156 +-
.../CodeGen/AMDGPU/call-args-inreg-bfloat.ll | 8 +-
...l-args-inreg-no-sgpr-for-csrspill-xfail.ll | 6 +-
llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 88 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 40 +-
.../AMDGPU/call-graph-register-usage.ll | 2 +-
.../AMDGPU/call-preserved-registers.ll | 116 +-
llvm/test/CodeGen/AMDGPU/call-skip.ll | 6 +-
.../test/CodeGen/AMDGPU/callee-frame-setup.ll | 106 +-
.../callee-special-input-vgprs-packed.ll | 14 +-
.../AMDGPU/callee-special-input-vgprs.ll | 14 +-
llvm/test/CodeGen/AMDGPU/cc-entry.ll | 2 +-
.../AMDGPU/cc-inreg-sgpr0-3-mismatch.ll | 4 +-
.../AMDGPU/copysign-simplify-demanded-bits.ll | 56 +-
.../AMDGPU/cross-block-use-is-not-abi-copy.ll | 8 +-
llvm/test/CodeGen/AMDGPU/debug-frame.ll | 8 +-
.../AMDGPU/dwarf-multi-register-use-crash.ll | 64 +-
.../dynamic-vgpr-reserve-stack-for-cwsr.ll | 4 +-
.../AMDGPU/eliminate-frame-index-select.ll | 60 +-
.../fix-frame-reg-in-custom-csr-spills.ll | 2 +-
...frame-setup-without-sgpr-to-vgpr-spills.ll | 25 +-
.../CodeGen/AMDGPU/function-args-inreg.ll | 8 +-
.../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll | 144 +-
.../AMDGPU/gfx-callable-argument-types.ll | 1226 ++--
.../gfx-callable-preserved-registers.ll | 72 +-
.../AMDGPU/gfx-callable-return-types.ll | 42 +-
llvm/test/CodeGen/AMDGPU/global-alias.ll | 2 +-
.../identical-subrange-spill-infloop.ll | 96 +-
llvm/test/CodeGen/AMDGPU/indirect-call.ll | 1104 ++--
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 2 +-
.../CodeGen/AMDGPU/insert-waitcnts-crash.ll | 12 +-
llvm/test/CodeGen/AMDGPU/issue176578.ll | 60 +-
.../AMDGPU/materialize-frame-index-sgpr.ll | 1634 +++---
.../CodeGen/AMDGPU/mul24-pass-ordering.ll | 20 +-
.../AMDGPU/need-fp-from-vgpr-spills.ll | 6 +-
llvm/test/CodeGen/AMDGPU/nested-calls.ll | 4 +-
.../AMDGPU/no-source-locations-in-prologue.ll | 2 +-
llvm/test/CodeGen/AMDGPU/nofpclass-call.ll | 12 +-
.../AMDGPU/preserve-wwm-copy-dst-reg.ll | 25 +-
llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 6 +-
.../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 190 +-
.../AMDGPU/sgpr-spills-split-regalloc.ll | 27 +-
.../AMDGPU/shufflevector.v2i64.v8i64.ll | 299 +-
.../si-lower-sgpr-spills-vgpr-lanes-usage.mir | 18 +-
llvm/test/CodeGen/AMDGPU/sibling-call.ll | 222 +-
llvm/test/CodeGen/AMDGPU/stack-realign.ll | 2 +-
.../CodeGen/AMDGPU/stacksave_stackrestore.ll | 10 +-
.../AMDGPU/strictfp_f16_abi_promote.ll | 14 +-
.../CodeGen/AMDGPU/swdev504645-global-fold.ll | 3 +-
...unfold-masked-merge-scalar-variablemask.ll | 38 +-
.../AMDGPU/unstructured-cfg-def-use-issue.ll | 168 +-
.../CodeGen/AMDGPU/vgpr-tuple-allocation.ll | 12 +-
...terfall-call-target-av-register-failure.ll | 2 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 4 +-
.../CodeGen/AMDGPU/whole-wave-functions.ll | 25 +-
.../AMDGPU/whole-wave-register-copy.ll | 2 +-
.../AMDGPU/whole-wave-register-spill.ll | 4 +-
.../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 8 +-
92 files changed, 9565 insertions(+), 10296 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index cbd08f0fb5dff..a92876c624aee 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -267,11 +267,20 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
std::vector<CalleeSavedInfo> CSI;
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ MCRegister RetAddrReg = TRI->getReturnAddressReg(MF);
+ MCRegister RetAddrRegSub0 = TRI->getSubReg(RetAddrReg, AMDGPU::sub0);
+ MCRegister RetAddrRegSub1 = TRI->getSubReg(RetAddrReg, AMDGPU::sub1);
+ bool SpillRetAddrReg = false;
for (unsigned I = 0; CSRegs[I]; ++I) {
MCRegister Reg = CSRegs[I];
if (SavedRegs.test(Reg)) {
+ if (Reg == RetAddrRegSub0 || Reg == RetAddrRegSub1) {
+ SpillRetAddrReg = true;
+ continue;
+ }
+
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, MVT::i32);
int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
@@ -282,6 +291,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
}
}
+ // Return address uses a register pair. Add the super register to the
+ // CSI list so that it's easier to identify the entire spill and CFI
+ // can be emitted appropriately.
+ if (SpillRetAddrReg) {
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(RetAddrReg, MVT::i64);
+ int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+ TRI->getSpillAlign(*RC), true);
+ CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
+ CalleeSavedFIs.push_back(JunkFI);
+ }
+
if (!CSI.empty()) {
for (MachineBasicBlock *SaveBlock : SaveBlocks)
insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index 62fe5f101b458..bd808190f6eb2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -23,10 +23,10 @@ define ptr addrspace(1) @call_assert_align() {
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
-; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7e6f500181ec6..2c1beb8468576 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -238,8 +238,8 @@ define void @func_caller_stack() #2 {
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -277,8 +277,8 @@ define void @func_caller_stack() #2 {
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -363,8 +363,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -414,8 +414,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:56
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 72766f47030cc..35591cd602992 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -244,8 +244,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 3194581fa4213..0e24430e7be20 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -30,8 +30,8 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
+; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: s_mov_b32 s32, s33
; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index c78544bee46a2..60ce2ce2d99ae 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -35,8 +35,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -78,8 +78,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GISEL-NEXT: scratch_load_b32 v41, off, s33
; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GISEL-NEXT: v_readlane_b32 s31, v42, 1
; GISEL-NEXT: v_readlane_b32 s30, v42, 0
+; GISEL-NEXT: v_readlane_b32 s31, v42, 1
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v42, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -787,8 +787,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -822,8 +822,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v40, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 4eb5380929661..eb5734a176fb7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -7048,69 +7048,69 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v21, s30, 0
-; SI-NEXT: v_writelane_b32 v21, s31, 1
-; SI-NEXT: v_writelane_b32 v21, s34, 2
-; SI-NEXT: v_writelane_b32 v21, s35, 3
-; SI-NEXT: v_writelane_b32 v21, s36, 4
-; SI-NEXT: v_writelane_b32 v21, s37, 5
-; SI-NEXT: v_writelane_b32 v21, s38, 6
-; SI-NEXT: v_writelane_b32 v21, s39, 7
-; SI-NEXT: v_writelane_b32 v21, s48, 8
-; SI-NEXT: v_writelane_b32 v21, s49, 9
-; SI-NEXT: v_writelane_b32 v21, s50, 10
-; SI-NEXT: v_writelane_b32 v21, s51, 11
-; SI-NEXT: v_writelane_b32 v21, s52, 12
-; SI-NEXT: v_writelane_b32 v21, s53, 13
-; SI-NEXT: v_writelane_b32 v21, s54, 14
-; SI-NEXT: v_writelane_b32 v21, s55, 15
-; SI-NEXT: v_writelane_b32 v21, s64, 16
-; SI-NEXT: v_writelane_b32 v21, s65, 17
-; SI-NEXT: v_writelane_b32 v21, s66, 18
-; SI-NEXT: v_writelane_b32 v21, s67, 19
-; SI-NEXT: v_writelane_b32 v21, s68, 20
+; SI-NEXT: v_writelane_b32 v21, s34, 0
+; SI-NEXT: v_writelane_b32 v21, s35, 1
+; SI-NEXT: v_writelane_b32 v21, s36, 2
+; SI-NEXT: v_writelane_b32 v21, s37, 3
+; SI-NEXT: v_writelane_b32 v21, s38, 4
+; SI-NEXT: v_writelane_b32 v21, s39, 5
+; SI-NEXT: v_writelane_b32 v21, s48, 6
+; SI-NEXT: v_writelane_b32 v21, s49, 7
+; SI-NEXT: v_writelane_b32 v21, s50, 8
+; SI-NEXT: v_writelane_b32 v21, s51, 9
+; SI-NEXT: v_writelane_b32 v21, s52, 10
+; SI-NEXT: v_writelane_b32 v21, s53, 11
+; SI-NEXT: v_writelane_b32 v21, s54, 12
+; SI-NEXT: v_writelane_b32 v21, s55, 13
+; SI-NEXT: v_writelane_b32 v21, s64, 14
+; SI-NEXT: v_writelane_b32 v21, s65, 15
+; SI-NEXT: v_writelane_b32 v21, s66, 16
+; SI-NEXT: v_writelane_b32 v21, s67, 17
+; SI-NEXT: v_writelane_b32 v21, s68, 18
+; SI-NEXT: v_writelane_b32 v21, s69, 19
+; SI-NEXT: v_writelane_b32 v21, s70, 20
; SI-NEXT: v_mov_b32_e32 v20, s16
-; SI-NEXT: v_writelane_b32 v21, s69, 21
+; SI-NEXT: v_writelane_b32 v21, s71, 21
; SI-NEXT: v_readfirstlane_b32 s56, v20
; SI-NEXT: v_mov_b32_e32 v20, s17
-; SI-NEXT: v_writelane_b32 v21, s70, 22
+; SI-NEXT: v_writelane_b32 v21, s80, 22
; SI-NEXT: v_readfirstlane_b32 s57, v20
; SI-NEXT: v_mov_b32_e32 v20, s18
-; SI-NEXT: v_writelane_b32 v21, s71, 23
+; SI-NEXT: v_writelane_b32 v21, s81, 23
; SI-NEXT: v_readfirstlane_b32 s46, v20
; SI-NEXT: v_mov_b32_e32 v20, s19
-; SI-NEXT: v_writelane_b32 v21, s80, 24
+; SI-NEXT: v_writelane_b32 v21, s82, 24
; SI-NEXT: v_readfirstlane_b32 s47, v20
; SI-NEXT: v_mov_b32_e32 v20, s20
-; SI-NEXT: v_writelane_b32 v21, s81, 25
+; SI-NEXT: v_writelane_b32 v21, s83, 25
; SI-NEXT: v_readfirstlane_b32 s44, v20
; SI-NEXT: v_mov_b32_e32 v20, s21
-; SI-NEXT: v_writelane_b32 v21, s82, 26
+; SI-NEXT: v_writelane_b32 v21, s84, 26
; SI-NEXT: v_readfirstlane_b32 s45, v20
; SI-NEXT: v_mov_b32_e32 v20, s22
-; SI-NEXT: v_writelane_b32 v21, s83, 27
+; SI-NEXT: v_writelane_b32 v21, s85, 27
; SI-NEXT: v_readfirstlane_b32 s42, v20
; SI-NEXT: v_mov_b32_e32 v20, s23
-; SI-NEXT: v_writelane_b32 v21, s84, 28
+; SI-NEXT: v_writelane_b32 v21, s86, 28
; SI-NEXT: v_readfirstlane_b32 s43, v20
; SI-NEXT: v_mov_b32_e32 v20, s24
-; SI-NEXT: v_writelane_b32 v21, s85, 29
+; SI-NEXT: v_writelane_b32 v21, s87, 29
; SI-NEXT: v_readfirstlane_b32 s40, v20
; SI-NEXT: v_mov_b32_e32 v20, s25
-; SI-NEXT: v_writelane_b32 v21, s86, 30
+; SI-NEXT: v_writelane_b32 v21, s96, 30
; SI-NEXT: v_readfirstlane_b32 s41, v20
; SI-NEXT: v_mov_b32_e32 v20, s26
-; SI-NEXT: v_writelane_b32 v21, s87, 31
+; SI-NEXT: v_writelane_b32 v21, s97, 31
; SI-NEXT: v_readfirstlane_b32 s24, v20
; SI-NEXT: v_mov_b32_e32 v20, s27
-; SI-NEXT: v_writelane_b32 v21, s96, 32
+; SI-NEXT: v_writelane_b32 v21, s98, 32
; SI-NEXT: v_readfirstlane_b32 s25, v20
; SI-NEXT: v_mov_b32_e32 v20, s28
-; SI-NEXT: v_writelane_b32 v21, s97, 33
+; SI-NEXT: v_writelane_b32 v21, s99, 33
; SI-NEXT: v_readfirstlane_b32 s22, v20
; SI-NEXT: v_mov_b32_e32 v20, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: v_writelane_b32 v21, s98, 34
+; SI-NEXT: v_writelane_b32 v21, s30, 34
; SI-NEXT: v_readfirstlane_b32 s23, v20
; SI-NEXT: v_readfirstlane_b32 s20, v1
; SI-NEXT: v_readfirstlane_b32 s21, v2
@@ -7131,7 +7131,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v17
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v18
-; SI-NEXT: v_writelane_b32 v21, s99, 35
+; SI-NEXT: v_writelane_b32 v21, s31, 35
; SI-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB13_4
@@ -8000,6 +8000,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v21, 34
; SI-NEXT: v_readlane_b32 s21, v23, 5
; SI-NEXT: v_readlane_b32 s19, v23, 11
; SI-NEXT: v_readlane_b32 s17, v23, 17
@@ -8008,42 +8009,41 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_readlane_b32 s11, v23, 35
; SI-NEXT: v_readlane_b32 s9, v23, 41
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v21, 35
-; SI-NEXT: v_readlane_b32 s98, v21, 34
-; SI-NEXT: v_readlane_b32 s97, v21, 33
-; SI-NEXT: v_readlane_b32 s96, v21, 32
-; SI-NEXT: v_readlane_b32 s87, v21, 31
-; SI-NEXT: v_readlane_b32 s86, v21, 30
-; SI-NEXT: v_readlane_b32 s85, v21, 29
-; SI-NEXT: v_readlane_b32 s84, v21, 28
-; SI-NEXT: v_readlane_b32 s83, v21, 27
-; SI-NEXT: v_readlane_b32 s82, v21, 26
-; SI-NEXT: v_readlane_b32 s81, v21, 25
-; SI-NEXT: v_readlane_b32 s80, v21, 24
-; SI-NEXT: v_readlane_b32 s71, v21, 23
-; SI-NEXT: v_readlane_b32 s70, v21, 22
-; SI-NEXT: v_readlane_b32 s69, v21, 21
-; SI-NEXT: v_readlane_b32 s68, v21, 20
-; SI-NEXT: v_readlane_b32 s67, v21, 19
-; SI-NEXT: v_readlane_b32 s66, v21, 18
-; SI-NEXT: v_readlane_b32 s65, v21, 17
-; SI-NEXT: v_readlane_b32 s64, v21, 16
-; SI-NEXT: v_readlane_b32 s55, v21, 15
-; SI-NEXT: v_readlane_b32 s54, v21, 14
-; SI-NEXT: v_readlane_b32 s53, v21, 13
-; SI-NEXT: v_readlane_b32 s52, v21, 12
-; SI-NEXT: v_readlane_b32 s51, v21, 11
-; SI-NEXT: v_readlane_b32 s50, v21, 10
-; SI-NEXT: v_readlane_b32 s49, v21, 9
-; SI-NEXT: v_readlane_b32 s48, v21, 8
-; SI-NEXT: v_readlane_b32 s39, v21, 7
-; SI-NEXT: v_readlane_b32 s38, v21, 6
-; SI-NEXT: v_readlane_b32 s37, v21, 5
-; SI-NEXT: v_readlane_b32 s36, v21, 4
-; SI-NEXT: v_readlane_b32 s35, v21, 3
-; SI-NEXT: v_readlane_b32 s34, v21, 2
-; SI-NEXT: v_readlane_b32 s31, v21, 1
-; SI-NEXT: v_readlane_b32 s30, v21, 0
+; SI-NEXT: v_readlane_b32 s31, v21, 35
+; SI-NEXT: v_readlane_b32 s99, v21, 33
+; SI-NEXT: v_readlane_b32 s98, v21, 32
+; SI-NEXT: v_readlane_b32 s97, v21, 31
+; SI-NEXT: v_readlane_b32 s96, v21, 30
+; SI-NEXT: v_readlane_b32 s87, v21, 29
+; SI-NEXT: v_readlane_b32 s86, v21, 28
+; SI-NEXT: v_readlane_b32 s85, v21, 27
+; SI-NEXT: v_readlane_b32 s84, v21, 26
+; SI-NEXT: v_readlane_b32 s83, v21, 25
+; SI-NEXT: v_readlane_b32 s82, v21, 24
+; SI-NEXT: v_readlane_b32 s81, v21, 23
+; SI-NEXT: v_readlane_b32 s80, v21, 22
+; SI-NEXT: v_readlane_b32 s71, v21, 21
+; SI-NEXT: v_readlane_b32 s70, v21, 20
+; SI-NEXT: v_readlane_b32 s69, v21, 19
+; SI-NEXT: v_readlane_b32 s68, v21, 18
+; SI-NEXT: v_readlane_b32 s67, v21, 17
+; SI-NEXT: v_readlane_b32 s66, v21, 16
+; SI-NEXT: v_readlane_b32 s65, v21, 15
+; SI-NEXT: v_readlane_b32 s64, v21, 14
+; SI-NEXT: v_readlane_b32 s55, v21, 13
+; SI-NEXT: v_readlane_b32 s54, v21, 12
+; SI-NEXT: v_readlane_b32 s53, v21, 11
+; SI-NEXT: v_readlane_b32 s52, v21, 10
+; SI-NEXT: v_readlane_b32 s51, v21, 9
+; SI-NEXT: v_readlane_b32 s50, v21, 8
+; SI-NEXT: v_readlane_b32 s49, v21, 7
+; SI-NEXT: v_readlane_b32 s48, v21, 6
+; SI-NEXT: v_readlane_b32 s39, v21, 5
+; SI-NEXT: v_readlane_b32 s38, v21, 4
+; SI-NEXT: v_readlane_b32 s37, v21, 3
+; SI-NEXT: v_readlane_b32 s36, v21, 2
+; SI-NEXT: v_readlane_b32 s35, v21, 1
+; SI-NEXT: v_readlane_b32 s34, v21, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -8240,65 +8240,65 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v32, s30, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 1
-; VI-NEXT: v_writelane_b32 v32, s34, 2
-; VI-NEXT: v_writelane_b32 v32, s35, 3
-; VI-NEXT: v_writelane_b32 v32, s36, 4
-; VI-NEXT: v_writelane_b32 v32, s37, 5
-; VI-NEXT: v_writelane_b32 v32, s38, 6
-; VI-NEXT: v_writelane_b32 v32, s39, 7
-; VI-NEXT: v_writelane_b32 v32, s48, 8
-; VI-NEXT: v_writelane_b32 v32, s49, 9
-; VI-NEXT: v_writelane_b32 v32, s50, 10
-; VI-NEXT: v_writelane_b32 v32, s51, 11
-; VI-NEXT: v_writelane_b32 v32, s52, 12
-; VI-NEXT: v_writelane_b32 v32, s53, 13
-; VI-NEXT: v_writelane_b32 v32, s54, 14
-; VI-NEXT: v_writelane_b32 v32, s55, 15
-; VI-NEXT: v_writelane_b32 v32, s64, 16
+; VI-NEXT: v_writelane_b32 v32, s34, 0
+; VI-NEXT: v_writelane_b32 v32, s35, 1
+; VI-NEXT: v_writelane_b32 v32, s36, 2
+; VI-NEXT: v_writelane_b32 v32, s37, 3
+; VI-NEXT: v_writelane_b32 v32, s38, 4
+; VI-NEXT: v_writelane_b32 v32, s39, 5
+; VI-NEXT: v_writelane_b32 v32, s48, 6
+; VI-NEXT: v_writelane_b32 v32, s49, 7
+; VI-NEXT: v_writelane_b32 v32, s50, 8
+; VI-NEXT: v_writelane_b32 v32, s51, 9
+; VI-NEXT: v_writelane_b32 v32, s52, 10
+; VI-NEXT: v_writelane_b32 v32, s53, 11
+; VI-NEXT: v_writelane_b32 v32, s54, 12
+; VI-NEXT: v_writelane_b32 v32, s55, 13
+; VI-NEXT: v_writelane_b32 v32, s64, 14
+; VI-NEXT: v_writelane_b32 v32, s65, 15
+; VI-NEXT: v_writelane_b32 v32, s66, 16
; VI-NEXT: v_mov_b32_e32 v20, s16
-; VI-NEXT: v_writelane_b32 v32, s65, 17
+; VI-NEXT: v_writelane_b32 v32, s67, 17
; VI-NEXT: v_readfirstlane_b32 s56, v20
; VI-NEXT: v_mov_b32_e32 v20, s17
-; VI-NEXT: v_writelane_b32 v32, s66, 18
+; VI-NEXT: v_writelane_b32 v32, s68, 18
; VI-NEXT: v_readfirstlane_b32 s57, v20
; VI-NEXT: v_mov_b32_e32 v20, s18
-; VI-NEXT: v_writelane_b32 v32, s67, 19
+; VI-NEXT: v_writelane_b32 v32, s69, 19
; VI-NEXT: v_readfirstlane_b32 s46, v20
; VI-NEXT: v_mov_b32_e32 v20, s19
-; VI-NEXT: v_writelane_b32 v32, s68, 20
+; VI-NEXT: v_writelane_b32 v32, s70, 20
; VI-NEXT: v_readfirstlane_b32 s47, v20
; VI-NEXT: v_mov_b32_e32 v20, s20
-; VI-NEXT: v_writelane_b32 v32, s69, 21
+; VI-NEXT: v_writelane_b32 v32, s71, 21
; VI-NEXT: v_readfirstlane_b32 s44, v20
; VI-NEXT: v_mov_b32_e32 v20, s21
-; VI-NEXT: v_writelane_b32 v32, s70, 22
+; VI-NEXT: v_writelane_b32 v32, s80, 22
; VI-NEXT: v_readfirstlane_b32 s45, v20
; VI-NEXT: v_mov_b32_e32 v20, s22
-; VI-NEXT: v_writelane_b32 v32, s71, 23
+; VI-NEXT: v_writelane_b32 v32, s81, 23
; VI-NEXT: v_readfirstlane_b32 s42, v20
; VI-NEXT: v_mov_b32_e32 v20, s23
-; VI-NEXT: v_writelane_b32 v32, s80, 24
+; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_readfirstlane_b32 s43, v20
; VI-NEXT: v_mov_b32_e32 v20, s24
-; VI-NEXT: v_writelane_b32 v32, s81, 25
+; VI-NEXT: v_writelane_b32 v32, s83, 25
; VI-NEXT: v_readfirstlane_b32 s40, v20
; VI-NEXT: v_mov_b32_e32 v20, s25
-; VI-NEXT: v_writelane_b32 v32, s82, 26
+; VI-NEXT: v_writelane_b32 v32, s84, 26
; VI-NEXT: v_readfirstlane_b32 s41, v20
; VI-NEXT: v_mov_b32_e32 v20, s26
-; VI-NEXT: v_writelane_b32 v32, s83, 27
+; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_readfirstlane_b32 s24, v20
; VI-NEXT: v_mov_b32_e32 v20, s27
-; VI-NEXT: v_writelane_b32 v32, s84, 28
+; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: v_readfirstlane_b32 s25, v20
; VI-NEXT: v_mov_b32_e32 v20, s28
-; VI-NEXT: v_writelane_b32 v32, s85, 29
+; VI-NEXT: v_writelane_b32 v32, s87, 29
; VI-NEXT: v_readfirstlane_b32 s22, v20
; VI-NEXT: v_mov_b32_e32 v20, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT: v_writelane_b32 v32, s86, 30
+; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: v_readfirstlane_b32 s23, v20
; VI-NEXT: v_readfirstlane_b32 s20, v1
; VI-NEXT: v_readfirstlane_b32 s21, v2
@@ -8319,7 +8319,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: s_and_b64 s[26:27], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v18
-; VI-NEXT: v_writelane_b32 v32, s87, 31
+; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB13_4
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -8978,40 +8978,40 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: v_readlane_b32 s30, v32, 30
; VI-NEXT: v_readlane_b32 s7, v33, 1
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s87, v32, 31
-; VI-NEXT: v_readlane_b32 s86, v32, 30
-; VI-NEXT: v_readlane_b32 s85, v32, 29
-; VI-NEXT: v_readlane_b32 s84, v32, 28
-; VI-NEXT: v_readlane_b32 s83, v32, 27
-; VI-NEXT: v_readlane_b32 s82, v32, 26
-; VI-NEXT: v_readlane_b32 s81, v32, 25
-; VI-NEXT: v_readlane_b32 s80, v32, 24
-; VI-NEXT: v_readlane_b32 s71, v32, 23
-; VI-NEXT: v_readlane_b32 s70, v32, 22
-; VI-NEXT: v_readlane_b32 s69, v32, 21
-; VI-NEXT: v_readlane_b32 s68, v32, 20
-; VI-NEXT: v_readlane_b32 s67, v32, 19
-; VI-NEXT: v_readlane_b32 s66, v32, 18
-; VI-NEXT: v_readlane_b32 s65, v32, 17
-; VI-NEXT: v_readlane_b32 s64, v32, 16
-; VI-NEXT: v_readlane_b32 s55, v32, 15
-; VI-NEXT: v_readlane_b32 s54, v32, 14
-; VI-NEXT: v_readlane_b32 s53, v32, 13
-; VI-NEXT: v_readlane_b32 s52, v32, 12
-; VI-NEXT: v_readlane_b32 s51, v32, 11
-; VI-NEXT: v_readlane_b32 s50, v32, 10
-; VI-NEXT: v_readlane_b32 s49, v32, 9
-; VI-NEXT: v_readlane_b32 s48, v32, 8
-; VI-NEXT: v_readlane_b32 s39, v32, 7
-; VI-NEXT: v_readlane_b32 s38, v32, 6
-; VI-NEXT: v_readlane_b32 s37, v32, 5
-; VI-NEXT: v_readlane_b32 s36, v32, 4
-; VI-NEXT: v_readlane_b32 s35, v32, 3
-; VI-NEXT: v_readlane_b32 s34, v32, 2
-; VI-NEXT: v_readlane_b32 s31, v32, 1
-; VI-NEXT: v_readlane_b32 s30, v32, 0
+; VI-NEXT: v_readlane_b32 s31, v32, 31
+; VI-NEXT: v_readlane_b32 s87, v32, 29
+; VI-NEXT: v_readlane_b32 s86, v32, 28
+; VI-NEXT: v_readlane_b32 s85, v32, 27
+; VI-NEXT: v_readlane_b32 s84, v32, 26
+; VI-NEXT: v_readlane_b32 s83, v32, 25
+; VI-NEXT: v_readlane_b32 s82, v32, 24
+; VI-NEXT: v_readlane_b32 s81, v32, 23
+; VI-NEXT: v_readlane_b32 s80, v32, 22
+; VI-NEXT: v_readlane_b32 s71, v32, 21
+; VI-NEXT: v_readlane_b32 s70, v32, 20
+; VI-NEXT: v_readlane_b32 s69, v32, 19
+; VI-NEXT: v_readlane_b32 s68, v32, 18
+; VI-NEXT: v_readlane_b32 s67, v32, 17
+; VI-NEXT: v_readlane_b32 s66, v32, 16
+; VI-NEXT: v_readlane_b32 s65, v32, 15
+; VI-NEXT: v_readlane_b32 s64, v32, 14
+; VI-NEXT: v_readlane_b32 s55, v32, 13
+; VI-NEXT: v_readlane_b32 s54, v32, 12
+; VI-NEXT: v_readlane_b32 s53, v32, 11
+; VI-NEXT: v_readlane_b32 s52, v32, 10
+; VI-NEXT: v_readlane_b32 s51, v32, 9
+; VI-NEXT: v_readlane_b32 s50, v32, 8
+; VI-NEXT: v_readlane_b32 s49, v32, 7
+; VI-NEXT: v_readlane_b32 s48, v32, 6
+; VI-NEXT: v_readlane_b32 s39, v32, 5
+; VI-NEXT: v_readlane_b32 s38, v32, 4
+; VI-NEXT: v_readlane_b32 s37, v32, 3
+; VI-NEXT: v_readlane_b32 s36, v32, 2
+; VI-NEXT: v_readlane_b32 s35, v32, 1
+; VI-NEXT: v_readlane_b32 s34, v32, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -9182,69 +9182,69 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v29, s30, 0
-; GFX9-NEXT: v_writelane_b32 v29, s31, 1
-; GFX9-NEXT: v_writelane_b32 v29, s34, 2
-; GFX9-NEXT: v_writelane_b32 v29, s35, 3
-; GFX9-NEXT: v_writelane_b32 v29, s36, 4
-; GFX9-NEXT: v_writelane_b32 v29, s37, 5
-; GFX9-NEXT: v_writelane_b32 v29, s38, 6
-; GFX9-NEXT: v_writelane_b32 v29, s39, 7
-; GFX9-NEXT: v_writelane_b32 v29, s48, 8
-; GFX9-NEXT: v_writelane_b32 v29, s49, 9
-; GFX9-NEXT: v_writelane_b32 v29, s50, 10
-; GFX9-NEXT: v_writelane_b32 v29, s51, 11
-; GFX9-NEXT: v_writelane_b32 v29, s52, 12
-; GFX9-NEXT: v_writelane_b32 v29, s53, 13
-; GFX9-NEXT: v_writelane_b32 v29, s54, 14
-; GFX9-NEXT: v_writelane_b32 v29, s55, 15
-; GFX9-NEXT: v_writelane_b32 v29, s64, 16
-; GFX9-NEXT: v_writelane_b32 v29, s65, 17
-; GFX9-NEXT: v_writelane_b32 v29, s66, 18
-; GFX9-NEXT: v_writelane_b32 v29, s67, 19
-; GFX9-NEXT: v_writelane_b32 v29, s68, 20
+; GFX9-NEXT: v_writelane_b32 v29, s34, 0
+; GFX9-NEXT: v_writelane_b32 v29, s35, 1
+; GFX9-NEXT: v_writelane_b32 v29, s36, 2
+; GFX9-NEXT: v_writelane_b32 v29, s37, 3
+; GFX9-NEXT: v_writelane_b32 v29, s38, 4
+; GFX9-NEXT: v_writelane_b32 v29, s39, 5
+; GFX9-NEXT: v_writelane_b32 v29, s48, 6
+; GFX9-NEXT: v_writelane_b32 v29, s49, 7
+; GFX9-NEXT: v_writelane_b32 v29, s50, 8
+; GFX9-NEXT: v_writelane_b32 v29, s51, 9
+; GFX9-NEXT: v_writelane_b32 v29, s52, 10
+; GFX9-NEXT: v_writelane_b32 v29, s53, 11
+; GFX9-NEXT: v_writelane_b32 v29, s54, 12
+; GFX9-NEXT: v_writelane_b32 v29, s55, 13
+; GFX9-NEXT: v_writelane_b32 v29, s64, 14
+; GFX9-NEXT: v_writelane_b32 v29, s65, 15
+; GFX9-NEXT: v_writelane_b32 v29, s66, 16
+; GFX9-NEXT: v_writelane_b32 v29, s67, 17
+; GFX9-NEXT: v_writelane_b32 v29, s68, 18
+; GFX9-NEXT: v_writelane_b32 v29, s69, 19
+; GFX9-NEXT: v_writelane_b32 v29, s70, 20
; GFX9-NEXT: v_mov_b32_e32 v20, s16
-; GFX9-NEXT: v_writelane_b32 v29, s69, 21
+; GFX9-NEXT: v_writelane_b32 v29, s71, 21
; GFX9-NEXT: v_readfirstlane_b32 s56, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s17
-; GFX9-NEXT: v_writelane_b32 v29, s70, 22
+; GFX9-NEXT: v_writelane_b32 v29, s80, 22
; GFX9-NEXT: v_readfirstlane_b32 s57, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s18
-; GFX9-NEXT: v_writelane_b32 v29, s71, 23
+; GFX9-NEXT: v_writelane_b32 v29, s81, 23
; GFX9-NEXT: v_readfirstlane_b32 s46, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s19
-; GFX9-NEXT: v_writelane_b32 v29, s80, 24
+; GFX9-NEXT: v_writelane_b32 v29, s82, 24
; GFX9-NEXT: v_readfirstlane_b32 s47, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s20
-; GFX9-NEXT: v_writelane_b32 v29, s81, 25
+; GFX9-NEXT: v_writelane_b32 v29, s83, 25
; GFX9-NEXT: v_readfirstlane_b32 s44, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s21
-; GFX9-NEXT: v_writelane_b32 v29, s82, 26
+; GFX9-NEXT: v_writelane_b32 v29, s84, 26
; GFX9-NEXT: v_readfirstlane_b32 s45, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s22
-; GFX9-NEXT: v_writelane_b32 v29, s83, 27
+; GFX9-NEXT: v_writelane_b32 v29, s85, 27
; GFX9-NEXT: v_readfirstlane_b32 s42, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s23
-; GFX9-NEXT: v_writelane_b32 v29, s84, 28
+; GFX9-NEXT: v_writelane_b32 v29, s86, 28
; GFX9-NEXT: v_readfirstlane_b32 s43, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s24
-; GFX9-NEXT: v_writelane_b32 v29, s85, 29
+; GFX9-NEXT: v_writelane_b32 v29, s87, 29
; GFX9-NEXT: v_readfirstlane_b32 s40, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s25
-; GFX9-NEXT: v_writelane_b32 v29, s86, 30
+; GFX9-NEXT: v_writelane_b32 v29, s96, 30
; GFX9-NEXT: v_readfirstlane_b32 s41, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s26
-; GFX9-NEXT: v_writelane_b32 v29, s87, 31
+; GFX9-NEXT: v_writelane_b32 v29, s97, 31
; GFX9-NEXT: v_readfirstlane_b32 s24, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s27
-; GFX9-NEXT: v_writelane_b32 v29, s96, 32
+; GFX9-NEXT: v_writelane_b32 v29, s98, 32
; GFX9-NEXT: v_readfirstlane_b32 s25, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s28
-; GFX9-NEXT: v_writelane_b32 v29, s97, 33
+; GFX9-NEXT: v_writelane_b32 v29, s99, 33
; GFX9-NEXT: v_readfirstlane_b32 s22, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; GFX9-NEXT: v_writelane_b32 v29, s98, 34
+; GFX9-NEXT: v_writelane_b32 v29, s30, 34
; GFX9-NEXT: v_readfirstlane_b32 s23, v20
; GFX9-NEXT: v_readfirstlane_b32 s20, v1
; GFX9-NEXT: v_readfirstlane_b32 s21, v2
@@ -9265,7 +9265,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s4, v17
; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v18
-; GFX9-NEXT: v_writelane_b32 v29, s99, 35
+; GFX9-NEXT: v_writelane_b32 v29, s31, 35
; GFX9-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB13_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -9870,43 +9870,43 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX9-NEXT: v_perm_b32 v1, s4, v3, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT: v_readlane_b32 s30, v29, 34
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: v_readlane_b32 s99, v29, 35
-; GFX9-NEXT: v_readlane_b32 s98, v29, 34
-; GFX9-NEXT: v_readlane_b32 s97, v29, 33
-; GFX9-NEXT: v_readlane_b32 s96, v29, 32
-; GFX9-NEXT: v_readlane_b32 s87, v29, 31
-; GFX9-NEXT: v_readlane_b32 s86, v29, 30
-; GFX9-NEXT: v_readlane_b32 s85, v29, 29
-; GFX9-NEXT: v_readlane_b32 s84, v29, 28
-; GFX9-NEXT: v_readlane_b32 s83, v29, 27
-; GFX9-NEXT: v_readlane_b32 s82, v29, 26
-; GFX9-NEXT: v_readlane_b32 s81, v29, 25
-; GFX9-NEXT: v_readlane_b32 s80, v29, 24
-; GFX9-NEXT: v_readlane_b32 s71, v29, 23
-; GFX9-NEXT: v_readlane_b32 s70, v29, 22
-; GFX9-NEXT: v_readlane_b32 s69, v29, 21
-; GFX9-NEXT: v_readlane_b32 s68, v29, 20
-; GFX9-NEXT: v_readlane_b32 s67, v29, 19
-; GFX9-NEXT: v_readlane_b32 s66, v29, 18
-; GFX9-NEXT: v_readlane_b32 s65, v29, 17
-; GFX9-NEXT: v_readlane_b32 s64, v29, 16
-; GFX9-NEXT: v_readlane_b32 s55, v29, 15
-; GFX9-NEXT: v_readlane_b32 s54, v29, 14
-; GFX9-NEXT: v_readlane_b32 s53, v29, 13
-; GFX9-NEXT: v_readlane_b32 s52, v29, 12
-; GFX9-NEXT: v_readlane_b32 s51, v29, 11
-; GFX9-NEXT: v_readlane_b32 s50, v29, 10
-; GFX9-NEXT: v_readlane_b32 s49, v29, 9
-; GFX9-NEXT: v_readlane_b32 s48, v29, 8
-; GFX9-NEXT: v_readlane_b32 s39, v29, 7
-; GFX9-NEXT: v_readlane_b32 s38, v29, 6
-; GFX9-NEXT: v_readlane_b32 s37, v29, 5
-; GFX9-NEXT: v_readlane_b32 s36, v29, 4
-; GFX9-NEXT: v_readlane_b32 s35, v29, 3
-; GFX9-NEXT: v_readlane_b32 s34, v29, 2
-; GFX9-NEXT: v_readlane_b32 s31, v29, 1
-; GFX9-NEXT: v_readlane_b32 s30, v29, 0
+; GFX9-NEXT: v_readlane_b32 s31, v29, 35
+; GFX9-NEXT: v_readlane_b32 s99, v29, 33
+; GFX9-NEXT: v_readlane_b32 s98, v29, 32
+; GFX9-NEXT: v_readlane_b32 s97, v29, 31
+; GFX9-NEXT: v_readlane_b32 s96, v29, 30
+; GFX9-NEXT: v_readlane_b32 s87, v29, 29
+; GFX9-NEXT: v_readlane_b32 s86, v29, 28
+; GFX9-NEXT: v_readlane_b32 s85, v29, 27
+; GFX9-NEXT: v_readlane_b32 s84, v29, 26
+; GFX9-NEXT: v_readlane_b32 s83, v29, 25
+; GFX9-NEXT: v_readlane_b32 s82, v29, 24
+; GFX9-NEXT: v_readlane_b32 s81, v29, 23
+; GFX9-NEXT: v_readlane_b32 s80, v29, 22
+; GFX9-NEXT: v_readlane_b32 s71, v29, 21
+; GFX9-NEXT: v_readlane_b32 s70, v29, 20
+; GFX9-NEXT: v_readlane_b32 s69, v29, 19
+; GFX9-NEXT: v_readlane_b32 s68, v29, 18
+; GFX9-NEXT: v_readlane_b32 s67, v29, 17
+; GFX9-NEXT: v_readlane_b32 s66, v29, 16
+; GFX9-NEXT: v_readlane_b32 s65, v29, 15
+; GFX9-NEXT: v_readlane_b32 s64, v29, 14
+; GFX9-NEXT: v_readlane_b32 s55, v29, 13
+; GFX9-NEXT: v_readlane_b32 s54, v29, 12
+; GFX9-NEXT: v_readlane_b32 s53, v29, 11
+; GFX9-NEXT: v_readlane_b32 s52, v29, 10
+; GFX9-NEXT: v_readlane_b32 s51, v29, 9
+; GFX9-NEXT: v_readlane_b32 s50, v29, 8
+; GFX9-NEXT: v_readlane_b32 s49, v29, 7
+; GFX9-NEXT: v_readlane_b32 s48, v29, 6
+; GFX9-NEXT: v_readlane_b32 s39, v29, 5
+; GFX9-NEXT: v_readlane_b32 s38, v29, 4
+; GFX9-NEXT: v_readlane_b32 s37, v29, 3
+; GFX9-NEXT: v_readlane_b32 s36, v29, 2
+; GFX9-NEXT: v_readlane_b32 s35, v29, 1
+; GFX9-NEXT: v_readlane_b32 s34, v29, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -10072,93 +10072,93 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v36, s32 offset:8
; GFX11-NEXT: scratch_store_b32 off, v37, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v34, s30, 0
-; GFX11-NEXT: v_writelane_b32 v35, s96, 0
+; GFX11-NEXT: v_writelane_b32 v34, s34, 0
+; GFX11-NEXT: v_writelane_b32 v35, s98, 0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: v_writelane_b32 v34, s31, 1
-; GFX11-NEXT: v_writelane_b32 v35, s97, 1
+; GFX11-NEXT: v_writelane_b32 v34, s35, 1
+; GFX11-NEXT: v_writelane_b32 v35, s99, 1
; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19
-; GFX11-NEXT: v_writelane_b32 v34, s34, 2
-; GFX11-NEXT: v_writelane_b32 v35, s98, 2
+; GFX11-NEXT: v_writelane_b32 v34, s36, 2
+; GFX11-NEXT: v_writelane_b32 v35, s100, 2
; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21
; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23
-; GFX11-NEXT: v_writelane_b32 v34, s35, 3
-; GFX11-NEXT: v_writelane_b32 v35, s99, 3
+; GFX11-NEXT: v_writelane_b32 v34, s37, 3
+; GFX11-NEXT: v_writelane_b32 v35, s101, 3
; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25
; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27
-; GFX11-NEXT: v_writelane_b32 v34, s36, 4
-; GFX11-NEXT: v_writelane_b32 v35, s100, 4
+; GFX11-NEXT: v_writelane_b32 v34, s38, 4
+; GFX11-NEXT: v_writelane_b32 v35, s102, 4
; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT: v_writelane_b32 v34, s37, 5
-; GFX11-NEXT: v_writelane_b32 v35, s101, 5
+; GFX11-NEXT: v_writelane_b32 v34, s39, 5
+; GFX11-NEXT: v_writelane_b32 v35, s103, 5
; GFX11-NEXT: v_readfirstlane_b32 s40, v16
; GFX11-NEXT: v_readfirstlane_b32 s41, v17
; GFX11-NEXT: v_readfirstlane_b32 s28, v18
-; GFX11-NEXT: v_writelane_b32 v34, s38, 6
-; GFX11-NEXT: v_writelane_b32 v35, s102, 6
+; GFX11-NEXT: v_writelane_b32 v34, s48, 6
+; GFX11-NEXT: v_writelane_b32 v35, s104, 6
; GFX11-NEXT: v_readfirstlane_b32 s29, v19
; GFX11-NEXT: v_readfirstlane_b32 s26, v20
; GFX11-NEXT: v_readfirstlane_b32 s27, v21
-; GFX11-NEXT: v_writelane_b32 v34, s39, 7
-; GFX11-NEXT: v_writelane_b32 v35, s103, 7
+; GFX11-NEXT: v_writelane_b32 v34, s49, 7
+; GFX11-NEXT: v_writelane_b32 v35, s30, 7
; GFX11-NEXT: v_readfirstlane_b32 s24, v22
; GFX11-NEXT: v_readfirstlane_b32 s25, v23
; GFX11-NEXT: v_readfirstlane_b32 s22, v24
-; GFX11-NEXT: v_writelane_b32 v34, s48, 8
+; GFX11-NEXT: v_writelane_b32 v34, s50, 8
; GFX11-NEXT: v_readfirstlane_b32 s23, v25
; GFX11-NEXT: v_readfirstlane_b32 s20, v26
; GFX11-NEXT: v_readfirstlane_b32 s21, v27
; GFX11-NEXT: v_readfirstlane_b32 s18, v28
-; GFX11-NEXT: v_writelane_b32 v34, s49, 9
+; GFX11-NEXT: v_writelane_b32 v34, s51, 9
; GFX11-NEXT: v_readfirstlane_b32 s19, v29
; GFX11-NEXT: v_readfirstlane_b32 s16, v30
; GFX11-NEXT: v_readfirstlane_b32 s17, v31
; GFX11-NEXT: v_readfirstlane_b32 s14, v32
-; GFX11-NEXT: v_writelane_b32 v34, s50, 10
+; GFX11-NEXT: v_writelane_b32 v34, s52, 10
; GFX11-NEXT: v_readfirstlane_b32 s15, v33
; GFX11-NEXT: v_readfirstlane_b32 s12, v1
; GFX11-NEXT: v_readfirstlane_b32 s13, v2
; GFX11-NEXT: v_readfirstlane_b32 s10, v3
-; GFX11-NEXT: v_writelane_b32 v34, s51, 11
+; GFX11-NEXT: v_writelane_b32 v34, s53, 11
; GFX11-NEXT: v_readfirstlane_b32 s11, v4
; GFX11-NEXT: v_readfirstlane_b32 s8, v5
; GFX11-NEXT: v_readfirstlane_b32 s9, v6
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
-; GFX11-NEXT: v_writelane_b32 v34, s52, 12
+; GFX11-NEXT: v_writelane_b32 v34, s54, 12
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s2, v11
-; GFX11-NEXT: v_writelane_b32 v34, s53, 13
+; GFX11-NEXT: v_writelane_b32 v34, s55, 13
; GFX11-NEXT: v_readfirstlane_b32 s3, v12
; GFX11-NEXT: v_readfirstlane_b32 s0, v13
; GFX11-NEXT: v_readfirstlane_b32 s1, v14
; GFX11-NEXT: s_mov_b32 vcc_hi, 0
-; GFX11-NEXT: v_writelane_b32 v34, s54, 14
+; GFX11-NEXT: v_writelane_b32 v34, s64, 14
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: v_writelane_b32 v35, s104, 8
+; GFX11-NEXT: v_writelane_b32 v35, s31, 8
; GFX11-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane
; GFX11-NEXT: ; implicit-def: $vgpr36 : SGPR spill to VGPR lane
-; GFX11-NEXT: v_writelane_b32 v34, s55, 15
-; GFX11-NEXT: v_writelane_b32 v34, s64, 16
-; GFX11-NEXT: v_writelane_b32 v34, s65, 17
-; GFX11-NEXT: v_writelane_b32 v34, s66, 18
-; GFX11-NEXT: v_writelane_b32 v34, s67, 19
-; GFX11-NEXT: v_writelane_b32 v34, s68, 20
-; GFX11-NEXT: v_writelane_b32 v34, s69, 21
-; GFX11-NEXT: v_writelane_b32 v34, s70, 22
-; GFX11-NEXT: v_writelane_b32 v34, s71, 23
-; GFX11-NEXT: v_writelane_b32 v34, s80, 24
-; GFX11-NEXT: v_writelane_b32 v34, s81, 25
-; GFX11-NEXT: v_writelane_b32 v34, s82, 26
-; GFX11-NEXT: v_writelane_b32 v34, s83, 27
-; GFX11-NEXT: v_writelane_b32 v34, s84, 28
-; GFX11-NEXT: v_writelane_b32 v34, s85, 29
-; GFX11-NEXT: v_writelane_b32 v34, s86, 30
-; GFX11-NEXT: v_writelane_b32 v34, s87, 31
+; GFX11-NEXT: v_writelane_b32 v34, s65, 15
+; GFX11-NEXT: v_writelane_b32 v34, s66, 16
+; GFX11-NEXT: v_writelane_b32 v34, s67, 17
+; GFX11-NEXT: v_writelane_b32 v34, s68, 18
+; GFX11-NEXT: v_writelane_b32 v34, s69, 19
+; GFX11-NEXT: v_writelane_b32 v34, s70, 20
+; GFX11-NEXT: v_writelane_b32 v34, s71, 21
+; GFX11-NEXT: v_writelane_b32 v34, s80, 22
+; GFX11-NEXT: v_writelane_b32 v34, s81, 23
+; GFX11-NEXT: v_writelane_b32 v34, s82, 24
+; GFX11-NEXT: v_writelane_b32 v34, s83, 25
+; GFX11-NEXT: v_writelane_b32 v34, s84, 26
+; GFX11-NEXT: v_writelane_b32 v34, s85, 27
+; GFX11-NEXT: v_writelane_b32 v34, s86, 28
+; GFX11-NEXT: v_writelane_b32 v34, s87, 29
+; GFX11-NEXT: v_writelane_b32 v34, s96, 30
+; GFX11-NEXT: v_writelane_b32 v34, s97, 31
; GFX11-NEXT: s_cbranch_scc0 .LBB13_2
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s72, s18, 16
@@ -10625,13 +10625,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: .LBB13_5: ; %end
; GFX11-NEXT: v_mov_b32_e32 v1, 0xc0c0004
; GFX11-NEXT: v_readlane_b32 s73, v36, 7
-; GFX11-NEXT: v_readlane_b32 s31, v34, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v2, s73, s30, v1
; GFX11-NEXT: v_readlane_b32 s73, v36, 8
-; GFX11-NEXT: v_readlane_b32 s30, v34, 0
+; GFX11-NEXT: v_readlane_b32 s30, v35, 7
+; GFX11-NEXT: v_readlane_b32 s31, v35, 8
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v3, s40, s73, v1
; GFX11-NEXT: v_readlane_b32 s40, v36, 6
; GFX11-NEXT: v_perm_b32 v22, s77, s80, v1
@@ -10640,7 +10640,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-NEXT: v_perm_b32 v4, s41, s40, v1
; GFX11-NEXT: v_readlane_b32 s40, v36, 2
-; GFX11-NEXT: v_readlane_b32 s80, v34, 24
+; GFX11-NEXT: v_readlane_b32 s80, v34, 22
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, s40, s92, v1
; GFX11-NEXT: v_readlane_b32 s40, v36, 3
@@ -10694,7 +10694,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_perm_b32 v6, s20, s104, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v14
; GFX11-NEXT: v_perm_b32 v14, s3, s39, v1
-; GFX11-NEXT: v_readlane_b32 s104, v35, 8
+; GFX11-NEXT: v_readlane_b32 s104, v35, 6
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v21
@@ -10707,8 +10707,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v5, v13, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v15
; GFX11-NEXT: v_perm_b32 v10, s19, s97, v1
-; GFX11-NEXT: v_readlane_b32 s97, v35, 1
-; GFX11-NEXT: v_readlane_b32 s39, v34, 7
+; GFX11-NEXT: v_readlane_b32 s97, v34, 31
+; GFX11-NEXT: v_readlane_b32 s39, v34, 5
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off offset:16
; GFX11-NEXT: v_perm_b32 v5, s102, s101, v1
; GFX11-NEXT: v_or_b32_e32 v3, v17, v8
@@ -10753,8 +10753,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v12
; GFX11-NEXT: v_or_b32_e32 v3, v10, v11
; GFX11-NEXT: v_perm_b32 v10, s63, s67, v1
-; GFX11-NEXT: v_readlane_b32 s103, v35, 7
-; GFX11-NEXT: v_readlane_b32 s102, v35, 6
+; GFX11-NEXT: v_readlane_b32 s103, v35, 5
+; GFX11-NEXT: v_readlane_b32 s102, v35, 4
; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-NEXT: v_perm_b32 v5, s13, s62, v1
; GFX11-NEXT: v_readlane_b32 s12, v37, 6
@@ -10766,7 +10766,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v5, v5, v7
; GFX11-NEXT: v_readlane_b32 s11, v37, 9
; GFX11-NEXT: v_perm_b32 v12, s58, s10, v1
-; GFX11-NEXT: v_readlane_b32 s101, v35, 5
+; GFX11-NEXT: v_readlane_b32 s101, v35, 3
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v6
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off offset:64
@@ -10777,8 +10777,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v12
; GFX11-NEXT: v_or_b32_e32 v7, v10, v11
; GFX11-NEXT: v_perm_b32 v10, s56, s57, v1
-; GFX11-NEXT: v_readlane_b32 s100, v35, 4
-; GFX11-NEXT: v_readlane_b32 s99, v35, 3
+; GFX11-NEXT: v_readlane_b32 s100, v35, 2
+; GFX11-NEXT: v_readlane_b32 s99, v35, 1
; GFX11-NEXT: v_or_b32_e32 v8, v8, v9
; GFX11-NEXT: v_perm_b32 v9, s9, s55, v1
; GFX11-NEXT: v_readlane_b32 s8, v37, 10
@@ -10790,12 +10790,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v9, v9, v3
; GFX11-NEXT: v_readlane_b32 s7, v37, 13
; GFX11-NEXT: v_perm_b32 v12, s51, s6, v1
-; GFX11-NEXT: v_readlane_b32 s98, v35, 2
+; GFX11-NEXT: v_readlane_b32 s98, v35, 0
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT: v_readlane_b32 s96, v35, 0
-; GFX11-NEXT: v_readlane_b32 s87, v34, 31
-; GFX11-NEXT: v_readlane_b32 s86, v34, 30
+; GFX11-NEXT: v_readlane_b32 s96, v34, 30
+; GFX11-NEXT: v_readlane_b32 s87, v34, 29
+; GFX11-NEXT: v_readlane_b32 s86, v34, 28
; GFX11-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-NEXT: v_perm_b32 v4, s4, s50, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v12
@@ -10809,51 +10809,51 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_readlane_b32 s5, v37, 15
; GFX11-NEXT: v_readlane_b32 s3, v37, 17
-; GFX11-NEXT: v_readlane_b32 s85, v34, 29
+; GFX11-NEXT: v_readlane_b32 s85, v34, 27
; GFX11-NEXT: v_perm_b32 v11, s42, s4, v1
; GFX11-NEXT: v_or_b32_e32 v5, v5, v10
; GFX11-NEXT: v_perm_b32 v10, s48, s38, v1
-; GFX11-NEXT: v_readlane_b32 s84, v34, 28
-; GFX11-NEXT: v_readlane_b32 s83, v34, 27
+; GFX11-NEXT: v_readlane_b32 s84, v34, 26
+; GFX11-NEXT: v_readlane_b32 s83, v34, 25
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_readlane_b32 s82, v34, 26
+; GFX11-NEXT: v_readlane_b32 s82, v34, 24
; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_readlane_b32 s81, v34, 25
-; GFX11-NEXT: v_readlane_b32 s71, v34, 23
+; GFX11-NEXT: v_readlane_b32 s81, v34, 23
+; GFX11-NEXT: v_readlane_b32 s71, v34, 21
; GFX11-NEXT: v_or_b32_e32 v10, v12, v11
; GFX11-NEXT: v_perm_b32 v12, s36, s2, v1
; GFX11-NEXT: v_or_b32_e32 v11, v14, v13
; GFX11-NEXT: v_perm_b32 v13, s0, s37, v1
-; GFX11-NEXT: v_readlane_b32 s70, v34, 22
-; GFX11-NEXT: v_readlane_b32 s69, v34, 21
+; GFX11-NEXT: v_readlane_b32 s70, v34, 20
+; GFX11-NEXT: v_readlane_b32 s69, v34, 19
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_readlane_b32 s68, v34, 20
-; GFX11-NEXT: v_readlane_b32 s67, v34, 19
-; GFX11-NEXT: v_readlane_b32 s66, v34, 18
-; GFX11-NEXT: v_readlane_b32 s65, v34, 17
+; GFX11-NEXT: v_readlane_b32 s68, v34, 18
+; GFX11-NEXT: v_readlane_b32 s67, v34, 17
+; GFX11-NEXT: v_readlane_b32 s66, v34, 16
+; GFX11-NEXT: v_readlane_b32 s65, v34, 15
; GFX11-NEXT: v_or_b32_e32 v12, v13, v12
; GFX11-NEXT: v_perm_b32 v13, s34, vcc_hi, v1
; GFX11-NEXT: v_perm_b32 v1, s1, s35, v1
-; GFX11-NEXT: v_readlane_b32 s64, v34, 16
-; GFX11-NEXT: v_readlane_b32 s55, v34, 15
-; GFX11-NEXT: v_readlane_b32 s54, v34, 14
+; GFX11-NEXT: v_readlane_b32 s64, v34, 14
+; GFX11-NEXT: v_readlane_b32 s55, v34, 13
+; GFX11-NEXT: v_readlane_b32 s54, v34, 12
; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT: v_readlane_b32 s53, v34, 13
-; GFX11-NEXT: v_readlane_b32 s52, v34, 12
-; GFX11-NEXT: v_readlane_b32 s51, v34, 11
-; GFX11-NEXT: v_readlane_b32 s50, v34, 10
+; GFX11-NEXT: v_readlane_b32 s53, v34, 11
+; GFX11-NEXT: v_readlane_b32 s52, v34, 10
+; GFX11-NEXT: v_readlane_b32 s51, v34, 9
+; GFX11-NEXT: v_readlane_b32 s50, v34, 8
; GFX11-NEXT: v_or_b32_e32 v13, v1, v13
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:112
-; GFX11-NEXT: v_readlane_b32 s49, v34, 9
-; GFX11-NEXT: v_readlane_b32 s48, v34, 8
-; GFX11-NEXT: v_readlane_b32 s38, v34, 6
-; GFX11-NEXT: v_readlane_b32 s37, v34, 5
-; GFX11-NEXT: v_readlane_b32 s36, v34, 4
-; GFX11-NEXT: v_readlane_b32 s35, v34, 3
-; GFX11-NEXT: v_readlane_b32 s34, v34, 2
+; GFX11-NEXT: v_readlane_b32 s49, v34, 7
+; GFX11-NEXT: v_readlane_b32 s48, v34, 6
+; GFX11-NEXT: v_readlane_b32 s38, v34, 4
+; GFX11-NEXT: v_readlane_b32 s37, v34, 3
+; GFX11-NEXT: v_readlane_b32 s36, v34, 2
+; GFX11-NEXT: v_readlane_b32 s35, v34, 1
+; GFX11-NEXT: v_readlane_b32 s34, v34, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v34, off, s32
@@ -21918,70 +21918,69 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v33, s30, 0
-; SI-NEXT: v_writelane_b32 v33, s31, 1
-; SI-NEXT: v_writelane_b32 v33, s34, 2
-; SI-NEXT: v_writelane_b32 v33, s35, 3
-; SI-NEXT: v_writelane_b32 v33, s36, 4
-; SI-NEXT: v_writelane_b32 v33, s37, 5
-; SI-NEXT: v_writelane_b32 v33, s38, 6
-; SI-NEXT: v_writelane_b32 v33, s39, 7
-; SI-NEXT: v_writelane_b32 v33, s48, 8
-; SI-NEXT: v_writelane_b32 v33, s49, 9
-; SI-NEXT: v_writelane_b32 v33, s50, 10
-; SI-NEXT: v_writelane_b32 v33, s51, 11
-; SI-NEXT: v_writelane_b32 v33, s52, 12
-; SI-NEXT: v_writelane_b32 v33, s53, 13
-; SI-NEXT: v_writelane_b32 v33, s54, 14
-; SI-NEXT: v_writelane_b32 v33, s55, 15
-; SI-NEXT: v_writelane_b32 v33, s64, 16
-; SI-NEXT: v_writelane_b32 v33, s65, 17
-; SI-NEXT: v_writelane_b32 v33, s66, 18
-; SI-NEXT: v_writelane_b32 v33, s67, 19
-; SI-NEXT: v_writelane_b32 v33, s68, 20
-; SI-NEXT: v_writelane_b32 v33, s69, 21
+; SI-NEXT: v_writelane_b32 v33, s34, 0
+; SI-NEXT: v_writelane_b32 v33, s35, 1
+; SI-NEXT: v_writelane_b32 v33, s36, 2
+; SI-NEXT: v_writelane_b32 v33, s37, 3
+; SI-NEXT: v_writelane_b32 v33, s38, 4
+; SI-NEXT: v_writelane_b32 v33, s39, 5
+; SI-NEXT: v_writelane_b32 v33, s48, 6
+; SI-NEXT: v_writelane_b32 v33, s49, 7
+; SI-NEXT: v_writelane_b32 v33, s50, 8
+; SI-NEXT: v_writelane_b32 v33, s51, 9
+; SI-NEXT: v_writelane_b32 v33, s52, 10
+; SI-NEXT: v_writelane_b32 v33, s53, 11
+; SI-NEXT: v_writelane_b32 v33, s54, 12
+; SI-NEXT: v_writelane_b32 v33, s55, 13
+; SI-NEXT: v_writelane_b32 v33, s64, 14
+; SI-NEXT: v_writelane_b32 v33, s65, 15
+; SI-NEXT: v_writelane_b32 v33, s66, 16
+; SI-NEXT: v_writelane_b32 v33, s67, 17
+; SI-NEXT: v_writelane_b32 v33, s68, 18
+; SI-NEXT: v_writelane_b32 v33, s69, 19
+; SI-NEXT: v_writelane_b32 v33, s70, 20
; SI-NEXT: v_mov_b32_e32 v19, s16
-; SI-NEXT: v_writelane_b32 v33, s70, 22
+; SI-NEXT: v_writelane_b32 v33, s71, 21
; SI-NEXT: v_readfirstlane_b32 s48, v19
; SI-NEXT: v_mov_b32_e32 v19, s17
-; SI-NEXT: v_writelane_b32 v33, s71, 23
+; SI-NEXT: v_writelane_b32 v33, s80, 22
; SI-NEXT: v_readfirstlane_b32 s49, v19
; SI-NEXT: v_mov_b32_e32 v19, s18
-; SI-NEXT: v_writelane_b32 v33, s80, 24
+; SI-NEXT: v_writelane_b32 v33, s81, 23
; SI-NEXT: v_readfirstlane_b32 s50, v19
; SI-NEXT: v_mov_b32_e32 v19, s19
-; SI-NEXT: v_writelane_b32 v33, s81, 25
+; SI-NEXT: v_writelane_b32 v33, s82, 24
; SI-NEXT: v_readfirstlane_b32 s51, v19
; SI-NEXT: v_mov_b32_e32 v19, s20
-; SI-NEXT: v_writelane_b32 v33, s82, 26
+; SI-NEXT: v_writelane_b32 v33, s83, 25
; SI-NEXT: v_readfirstlane_b32 s52, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
-; SI-NEXT: v_writelane_b32 v33, s83, 27
+; SI-NEXT: v_writelane_b32 v33, s84, 26
; SI-NEXT: v_readfirstlane_b32 s53, v19
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v33, s84, 28
+; SI-NEXT: v_writelane_b32 v33, s85, 27
; SI-NEXT: v_readfirstlane_b32 s54, v19
; SI-NEXT: v_mov_b32_e32 v19, s23
-; SI-NEXT: v_writelane_b32 v33, s85, 29
+; SI-NEXT: v_writelane_b32 v33, s86, 28
; SI-NEXT: v_readfirstlane_b32 s55, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
-; SI-NEXT: v_writelane_b32 v33, s86, 30
+; SI-NEXT: v_writelane_b32 v33, s87, 29
; SI-NEXT: v_readfirstlane_b32 s64, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
-; SI-NEXT: v_writelane_b32 v33, s87, 31
+; SI-NEXT: v_writelane_b32 v33, s96, 30
; SI-NEXT: v_readfirstlane_b32 s65, v19
; SI-NEXT: v_mov_b32_e32 v19, s26
-; SI-NEXT: v_writelane_b32 v33, s96, 32
+; SI-NEXT: v_writelane_b32 v33, s97, 31
; SI-NEXT: v_readfirstlane_b32 s66, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
-; SI-NEXT: v_writelane_b32 v33, s97, 33
+; SI-NEXT: v_writelane_b32 v33, s98, 32
; SI-NEXT: v_readfirstlane_b32 s67, v19
; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_writelane_b32 v33, s98, 34
+; SI-NEXT: v_writelane_b32 v33, s99, 33
; SI-NEXT: v_readfirstlane_b32 s68, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT: v_writelane_b32 v33, s99, 35
+; SI-NEXT: v_writelane_b32 v33, s30, 34
; SI-NEXT: v_readfirstlane_b32 s69, v19
; SI-NEXT: v_readfirstlane_b32 s70, v0
; SI-NEXT: v_readfirstlane_b32 s71, v1
@@ -22002,6 +22001,7 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: v_readfirstlane_b32 s8, v16
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s9, v17
+; SI-NEXT: v_writelane_b32 v33, s31, 35
; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB17_2
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -22411,43 +22411,43 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: v_mul_f32_e64 v31, 1.0, s5
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31
; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT: v_readlane_b32 s30, v33, 34
; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT: v_readlane_b32 s99, v33, 35
-; SI-NEXT: v_readlane_b32 s98, v33, 34
-; SI-NEXT: v_readlane_b32 s97, v33, 33
-; SI-NEXT: v_readlane_b32 s96, v33, 32
-; SI-NEXT: v_readlane_b32 s87, v33, 31
-; SI-NEXT: v_readlane_b32 s86, v33, 30
-; SI-NEXT: v_readlane_b32 s85, v33, 29
-; SI-NEXT: v_readlane_b32 s84, v33, 28
-; SI-NEXT: v_readlane_b32 s83, v33, 27
-; SI-NEXT: v_readlane_b32 s82, v33, 26
-; SI-NEXT: v_readlane_b32 s81, v33, 25
-; SI-NEXT: v_readlane_b32 s80, v33, 24
-; SI-NEXT: v_readlane_b32 s71, v33, 23
-; SI-NEXT: v_readlane_b32 s70, v33, 22
-; SI-NEXT: v_readlane_b32 s69, v33, 21
-; SI-NEXT: v_readlane_b32 s68, v33, 20
-; SI-NEXT: v_readlane_b32 s67, v33, 19
-; SI-NEXT: v_readlane_b32 s66, v33, 18
-; SI-NEXT: v_readlane_b32 s65, v33, 17
-; SI-NEXT: v_readlane_b32 s64, v33, 16
-; SI-NEXT: v_readlane_b32 s55, v33, 15
-; SI-NEXT: v_readlane_b32 s54, v33, 14
-; SI-NEXT: v_readlane_b32 s53, v33, 13
-; SI-NEXT: v_readlane_b32 s52, v33, 12
-; SI-NEXT: v_readlane_b32 s51, v33, 11
-; SI-NEXT: v_readlane_b32 s50, v33, 10
-; SI-NEXT: v_readlane_b32 s49, v33, 9
-; SI-NEXT: v_readlane_b32 s48, v33, 8
-; SI-NEXT: v_readlane_b32 s39, v33, 7
-; SI-NEXT: v_readlane_b32 s38, v33, 6
-; SI-NEXT: v_readlane_b32 s37, v33, 5
-; SI-NEXT: v_readlane_b32 s36, v33, 4
-; SI-NEXT: v_readlane_b32 s35, v33, 3
-; SI-NEXT: v_readlane_b32 s34, v33, 2
-; SI-NEXT: v_readlane_b32 s31, v33, 1
-; SI-NEXT: v_readlane_b32 s30, v33, 0
+; SI-NEXT: v_readlane_b32 s31, v33, 35
+; SI-NEXT: v_readlane_b32 s99, v33, 33
+; SI-NEXT: v_readlane_b32 s98, v33, 32
+; SI-NEXT: v_readlane_b32 s97, v33, 31
+; SI-NEXT: v_readlane_b32 s96, v33, 30
+; SI-NEXT: v_readlane_b32 s87, v33, 29
+; SI-NEXT: v_readlane_b32 s86, v33, 28
+; SI-NEXT: v_readlane_b32 s85, v33, 27
+; SI-NEXT: v_readlane_b32 s84, v33, 26
+; SI-NEXT: v_readlane_b32 s83, v33, 25
+; SI-NEXT: v_readlane_b32 s82, v33, 24
+; SI-NEXT: v_readlane_b32 s81, v33, 23
+; SI-NEXT: v_readlane_b32 s80, v33, 22
+; SI-NEXT: v_readlane_b32 s71, v33, 21
+; SI-NEXT: v_readlane_b32 s70, v33, 20
+; SI-NEXT: v_readlane_b32 s69, v33, 19
+; SI-NEXT: v_readlane_b32 s68, v33, 18
+; SI-NEXT: v_readlane_b32 s67, v33, 17
+; SI-NEXT: v_readlane_b32 s66, v33, 16
+; SI-NEXT: v_readlane_b32 s65, v33, 15
+; SI-NEXT: v_readlane_b32 s64, v33, 14
+; SI-NEXT: v_readlane_b32 s55, v33, 13
+; SI-NEXT: v_readlane_b32 s54, v33, 12
+; SI-NEXT: v_readlane_b32 s53, v33, 11
+; SI-NEXT: v_readlane_b32 s52, v33, 10
+; SI-NEXT: v_readlane_b32 s51, v33, 9
+; SI-NEXT: v_readlane_b32 s50, v33, 8
+; SI-NEXT: v_readlane_b32 s49, v33, 7
+; SI-NEXT: v_readlane_b32 s48, v33, 6
+; SI-NEXT: v_readlane_b32 s39, v33, 5
+; SI-NEXT: v_readlane_b32 s38, v33, 4
+; SI-NEXT: v_readlane_b32 s37, v33, 3
+; SI-NEXT: v_readlane_b32 s36, v33, 2
+; SI-NEXT: v_readlane_b32 s35, v33, 1
+; SI-NEXT: v_readlane_b32 s34, v33, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -29632,55 +29632,55 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v32, s30, 0
-; SI-NEXT: v_writelane_b32 v32, s31, 1
-; SI-NEXT: v_writelane_b32 v32, s34, 2
-; SI-NEXT: v_writelane_b32 v32, s35, 3
-; SI-NEXT: v_writelane_b32 v32, s36, 4
-; SI-NEXT: v_writelane_b32 v32, s37, 5
-; SI-NEXT: v_writelane_b32 v32, s38, 6
+; SI-NEXT: v_writelane_b32 v32, s34, 0
+; SI-NEXT: v_writelane_b32 v32, s35, 1
+; SI-NEXT: v_writelane_b32 v32, s36, 2
+; SI-NEXT: v_writelane_b32 v32, s37, 3
+; SI-NEXT: v_writelane_b32 v32, s38, 4
+; SI-NEXT: v_writelane_b32 v32, s39, 5
+; SI-NEXT: v_writelane_b32 v32, s48, 6
; SI-NEXT: v_mov_b32_e32 v19, s16
-; SI-NEXT: v_writelane_b32 v32, s39, 7
+; SI-NEXT: v_writelane_b32 v32, s49, 7
; SI-NEXT: v_readfirstlane_b32 s56, v19
; SI-NEXT: v_mov_b32_e32 v19, s17
-; SI-NEXT: v_writelane_b32 v32, s48, 8
+; SI-NEXT: v_writelane_b32 v32, s50, 8
; SI-NEXT: v_readfirstlane_b32 s57, v19
; SI-NEXT: v_mov_b32_e32 v19, s18
-; SI-NEXT: v_writelane_b32 v32, s49, 9
+; SI-NEXT: v_writelane_b32 v32, s51, 9
; SI-NEXT: v_readfirstlane_b32 s46, v19
; SI-NEXT: v_mov_b32_e32 v19, s19
-; SI-NEXT: v_writelane_b32 v32, s50, 10
+; SI-NEXT: v_writelane_b32 v32, s52, 10
; SI-NEXT: v_readfirstlane_b32 s47, v19
; SI-NEXT: v_mov_b32_e32 v19, s20
-; SI-NEXT: v_writelane_b32 v32, s51, 11
+; SI-NEXT: v_writelane_b32 v32, s53, 11
; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
-; SI-NEXT: v_writelane_b32 v32, s52, 12
+; SI-NEXT: v_writelane_b32 v32, s54, 12
; SI-NEXT: v_readfirstlane_b32 s45, v19
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v32, s53, 13
+; SI-NEXT: v_writelane_b32 v32, s55, 13
; SI-NEXT: v_readfirstlane_b32 s42, v19
; SI-NEXT: v_mov_b32_e32 v19, s23
-; SI-NEXT: v_writelane_b32 v32, s54, 14
+; SI-NEXT: v_writelane_b32 v32, s64, 14
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
-; SI-NEXT: v_writelane_b32 v32, s55, 15
+; SI-NEXT: v_writelane_b32 v32, s65, 15
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
-; SI-NEXT: v_writelane_b32 v32, s64, 16
+; SI-NEXT: v_writelane_b32 v32, s66, 16
; SI-NEXT: v_readfirstlane_b32 s41, v19
; SI-NEXT: v_mov_b32_e32 v19, s26
-; SI-NEXT: v_writelane_b32 v32, s65, 17
+; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
-; SI-NEXT: v_writelane_b32 v32, s66, 18
+; SI-NEXT: v_writelane_b32 v32, s68, 18
; SI-NEXT: v_readfirstlane_b32 s25, v19
; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_writelane_b32 v32, s67, 19
+; SI-NEXT: v_writelane_b32 v32, s69, 19
; SI-NEXT: v_readfirstlane_b32 s22, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT: v_writelane_b32 v32, s68, 20
+; SI-NEXT: v_writelane_b32 v32, s30, 20
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v0
; SI-NEXT: v_readfirstlane_b32 s21, v1
@@ -29701,7 +29701,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v17
-; SI-NEXT: v_writelane_b32 v32, s69, 21
+; SI-NEXT: v_writelane_b32 v32, s31, 21
; SI-NEXT: s_cbranch_scc0 .LBB21_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 16
@@ -29899,6 +29899,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s56
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v32, 20
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s46
@@ -29931,28 +29932,27 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v29, s7
; SI-NEXT: v_mov_b32_e32 v30, s4
; SI-NEXT: v_mov_b32_e32 v31, s5
-; SI-NEXT: v_readlane_b32 s69, v32, 21
-; SI-NEXT: v_readlane_b32 s68, v32, 20
-; SI-NEXT: v_readlane_b32 s67, v32, 19
-; SI-NEXT: v_readlane_b32 s66, v32, 18
-; SI-NEXT: v_readlane_b32 s65, v32, 17
-; SI-NEXT: v_readlane_b32 s64, v32, 16
-; SI-NEXT: v_readlane_b32 s55, v32, 15
-; SI-NEXT: v_readlane_b32 s54, v32, 14
-; SI-NEXT: v_readlane_b32 s53, v32, 13
-; SI-NEXT: v_readlane_b32 s52, v32, 12
-; SI-NEXT: v_readlane_b32 s51, v32, 11
-; SI-NEXT: v_readlane_b32 s50, v32, 10
-; SI-NEXT: v_readlane_b32 s49, v32, 9
-; SI-NEXT: v_readlane_b32 s48, v32, 8
-; SI-NEXT: v_readlane_b32 s39, v32, 7
-; SI-NEXT: v_readlane_b32 s38, v32, 6
-; SI-NEXT: v_readlane_b32 s37, v32, 5
-; SI-NEXT: v_readlane_b32 s36, v32, 4
-; SI-NEXT: v_readlane_b32 s35, v32, 3
-; SI-NEXT: v_readlane_b32 s34, v32, 2
-; SI-NEXT: v_readlane_b32 s31, v32, 1
-; SI-NEXT: v_readlane_b32 s30, v32, 0
+; SI-NEXT: v_readlane_b32 s31, v32, 21
+; SI-NEXT: v_readlane_b32 s69, v32, 19
+; SI-NEXT: v_readlane_b32 s68, v32, 18
+; SI-NEXT: v_readlane_b32 s67, v32, 17
+; SI-NEXT: v_readlane_b32 s66, v32, 16
+; SI-NEXT: v_readlane_b32 s65, v32, 15
+; SI-NEXT: v_readlane_b32 s64, v32, 14
+; SI-NEXT: v_readlane_b32 s55, v32, 13
+; SI-NEXT: v_readlane_b32 s54, v32, 12
+; SI-NEXT: v_readlane_b32 s53, v32, 11
+; SI-NEXT: v_readlane_b32 s52, v32, 10
+; SI-NEXT: v_readlane_b32 s51, v32, 9
+; SI-NEXT: v_readlane_b32 s50, v32, 8
+; SI-NEXT: v_readlane_b32 s49, v32, 7
+; SI-NEXT: v_readlane_b32 s48, v32, 6
+; SI-NEXT: v_readlane_b32 s39, v32, 5
+; SI-NEXT: v_readlane_b32 s38, v32, 4
+; SI-NEXT: v_readlane_b32 s37, v32, 3
+; SI-NEXT: v_readlane_b32 s36, v32, 2
+; SI-NEXT: v_readlane_b32 s35, v32, 1
+; SI-NEXT: v_readlane_b32 s34, v32, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -32481,55 +32481,55 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v32, s30, 0
-; SI-NEXT: v_writelane_b32 v32, s31, 1
-; SI-NEXT: v_writelane_b32 v32, s34, 2
-; SI-NEXT: v_writelane_b32 v32, s35, 3
-; SI-NEXT: v_writelane_b32 v32, s36, 4
-; SI-NEXT: v_writelane_b32 v32, s37, 5
-; SI-NEXT: v_writelane_b32 v32, s38, 6
+; SI-NEXT: v_writelane_b32 v32, s34, 0
+; SI-NEXT: v_writelane_b32 v32, s35, 1
+; SI-NEXT: v_writelane_b32 v32, s36, 2
+; SI-NEXT: v_writelane_b32 v32, s37, 3
+; SI-NEXT: v_writelane_b32 v32, s38, 4
+; SI-NEXT: v_writelane_b32 v32, s39, 5
+; SI-NEXT: v_writelane_b32 v32, s48, 6
; SI-NEXT: v_mov_b32_e32 v19, s16
-; SI-NEXT: v_writelane_b32 v32, s39, 7
+; SI-NEXT: v_writelane_b32 v32, s49, 7
; SI-NEXT: v_readfirstlane_b32 s56, v19
; SI-NEXT: v_mov_b32_e32 v19, s17
-; SI-NEXT: v_writelane_b32 v32, s48, 8
+; SI-NEXT: v_writelane_b32 v32, s50, 8
; SI-NEXT: v_readfirstlane_b32 s57, v19
; SI-NEXT: v_mov_b32_e32 v19, s18
-; SI-NEXT: v_writelane_b32 v32, s49, 9
+; SI-NEXT: v_writelane_b32 v32, s51, 9
; SI-NEXT: v_readfirstlane_b32 s46, v19
; SI-NEXT: v_mov_b32_e32 v19, s19
-; SI-NEXT: v_writelane_b32 v32, s50, 10
+; SI-NEXT: v_writelane_b32 v32, s52, 10
; SI-NEXT: v_readfirstlane_b32 s47, v19
; SI-NEXT: v_mov_b32_e32 v19, s20
-; SI-NEXT: v_writelane_b32 v32, s51, 11
+; SI-NEXT: v_writelane_b32 v32, s53, 11
; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
-; SI-NEXT: v_writelane_b32 v32, s52, 12
+; SI-NEXT: v_writelane_b32 v32, s54, 12
; SI-NEXT: v_readfirstlane_b32 s45, v19
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v32, s53, 13
+; SI-NEXT: v_writelane_b32 v32, s55, 13
; SI-NEXT: v_readfirstlane_b32 s42, v19
; SI-NEXT: v_mov_b32_e32 v19, s23
-; SI-NEXT: v_writelane_b32 v32, s54, 14
+; SI-NEXT: v_writelane_b32 v32, s64, 14
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
-; SI-NEXT: v_writelane_b32 v32, s55, 15
+; SI-NEXT: v_writelane_b32 v32, s65, 15
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
-; SI-NEXT: v_writelane_b32 v32, s64, 16
+; SI-NEXT: v_writelane_b32 v32, s66, 16
; SI-NEXT: v_readfirstlane_b32 s41, v19
; SI-NEXT: v_mov_b32_e32 v19, s26
-; SI-NEXT: v_writelane_b32 v32, s65, 17
+; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
-; SI-NEXT: v_writelane_b32 v32, s66, 18
+; SI-NEXT: v_writelane_b32 v32, s68, 18
; SI-NEXT: v_readfirstlane_b32 s25, v19
; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_writelane_b32 v32, s67, 19
+; SI-NEXT: v_writelane_b32 v32, s69, 19
; SI-NEXT: v_readfirstlane_b32 s22, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT: v_writelane_b32 v32, s68, 20
+; SI-NEXT: v_writelane_b32 v32, s30, 20
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v0
; SI-NEXT: v_readfirstlane_b32 s21, v1
@@ -32550,7 +32550,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v17
-; SI-NEXT: v_writelane_b32 v32, s69, 21
+; SI-NEXT: v_writelane_b32 v32, s31, 21
; SI-NEXT: s_cbranch_scc0 .LBB25_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 16
@@ -32748,6 +32748,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s56
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v32, 20
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s46
@@ -32780,28 +32781,27 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v29, s7
; SI-NEXT: v_mov_b32_e32 v30, s4
; SI-NEXT: v_mov_b32_e32 v31, s5
-; SI-NEXT: v_readlane_b32 s69, v32, 21
-; SI-NEXT: v_readlane_b32 s68, v32, 20
-; SI-NEXT: v_readlane_b32 s67, v32, 19
-; SI-NEXT: v_readlane_b32 s66, v32, 18
-; SI-NEXT: v_readlane_b32 s65, v32, 17
-; SI-NEXT: v_readlane_b32 s64, v32, 16
-; SI-NEXT: v_readlane_b32 s55, v32, 15
-; SI-NEXT: v_readlane_b32 s54, v32, 14
-; SI-NEXT: v_readlane_b32 s53, v32, 13
-; SI-NEXT: v_readlane_b32 s52, v32, 12
-; SI-NEXT: v_readlane_b32 s51, v32, 11
-; SI-NEXT: v_readlane_b32 s50, v32, 10
-; SI-NEXT: v_readlane_b32 s49, v32, 9
-; SI-NEXT: v_readlane_b32 s48, v32, 8
-; SI-NEXT: v_readlane_b32 s39, v32, 7
-; SI-NEXT: v_readlane_b32 s38, v32, 6
-; SI-NEXT: v_readlane_b32 s37, v32, 5
-; SI-NEXT: v_readlane_b32 s36, v32, 4
-; SI-NEXT: v_readlane_b32 s35, v32, 3
-; SI-NEXT: v_readlane_b32 s34, v32, 2
-; SI-NEXT: v_readlane_b32 s31, v32, 1
-; SI-NEXT: v_readlane_b32 s30, v32, 0
+; SI-NEXT: v_readlane_b32 s31, v32, 21
+; SI-NEXT: v_readlane_b32 s69, v32, 19
+; SI-NEXT: v_readlane_b32 s68, v32, 18
+; SI-NEXT: v_readlane_b32 s67, v32, 17
+; SI-NEXT: v_readlane_b32 s66, v32, 16
+; SI-NEXT: v_readlane_b32 s65, v32, 15
+; SI-NEXT: v_readlane_b32 s64, v32, 14
+; SI-NEXT: v_readlane_b32 s55, v32, 13
+; SI-NEXT: v_readlane_b32 s54, v32, 12
+; SI-NEXT: v_readlane_b32 s53, v32, 11
+; SI-NEXT: v_readlane_b32 s52, v32, 10
+; SI-NEXT: v_readlane_b32 s51, v32, 9
+; SI-NEXT: v_readlane_b32 s50, v32, 8
+; SI-NEXT: v_readlane_b32 s49, v32, 7
+; SI-NEXT: v_readlane_b32 s48, v32, 6
+; SI-NEXT: v_readlane_b32 s39, v32, 5
+; SI-NEXT: v_readlane_b32 s38, v32, 4
+; SI-NEXT: v_readlane_b32 s37, v32, 3
+; SI-NEXT: v_readlane_b32 s36, v32, 2
+; SI-NEXT: v_readlane_b32 s35, v32, 1
+; SI-NEXT: v_readlane_b32 s34, v32, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -40580,70 +40580,70 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v63, s30, 0
-; SI-NEXT: v_writelane_b32 v63, s31, 1
-; SI-NEXT: v_writelane_b32 v63, s34, 2
-; SI-NEXT: v_writelane_b32 v63, s35, 3
-; SI-NEXT: v_writelane_b32 v63, s36, 4
-; SI-NEXT: v_writelane_b32 v63, s37, 5
-; SI-NEXT: v_writelane_b32 v63, s38, 6
-; SI-NEXT: v_writelane_b32 v63, s39, 7
-; SI-NEXT: v_writelane_b32 v63, s48, 8
-; SI-NEXT: v_writelane_b32 v63, s49, 9
-; SI-NEXT: v_writelane_b32 v63, s50, 10
-; SI-NEXT: v_writelane_b32 v63, s51, 11
-; SI-NEXT: v_writelane_b32 v63, s52, 12
-; SI-NEXT: v_writelane_b32 v63, s53, 13
-; SI-NEXT: v_writelane_b32 v63, s54, 14
-; SI-NEXT: v_writelane_b32 v63, s55, 15
-; SI-NEXT: v_writelane_b32 v63, s64, 16
-; SI-NEXT: v_writelane_b32 v63, s65, 17
-; SI-NEXT: v_writelane_b32 v63, s66, 18
-; SI-NEXT: v_writelane_b32 v63, s67, 19
-; SI-NEXT: v_writelane_b32 v63, s68, 20
-; SI-NEXT: v_writelane_b32 v63, s69, 21
+; SI-NEXT: v_writelane_b32 v63, s34, 0
+; SI-NEXT: v_writelane_b32 v63, s35, 1
+; SI-NEXT: v_writelane_b32 v63, s36, 2
+; SI-NEXT: v_writelane_b32 v63, s37, 3
+; SI-NEXT: v_writelane_b32 v63, s38, 4
+; SI-NEXT: v_writelane_b32 v63, s39, 5
+; SI-NEXT: v_writelane_b32 v63, s48, 6
+; SI-NEXT: v_writelane_b32 v63, s49, 7
+; SI-NEXT: v_writelane_b32 v63, s50, 8
+; SI-NEXT: v_writelane_b32 v63, s51, 9
+; SI-NEXT: v_writelane_b32 v63, s52, 10
+; SI-NEXT: v_writelane_b32 v63, s53, 11
+; SI-NEXT: v_writelane_b32 v63, s54, 12
+; SI-NEXT: v_writelane_b32 v63, s55, 13
+; SI-NEXT: v_writelane_b32 v63, s64, 14
+; SI-NEXT: v_writelane_b32 v63, s65, 15
+; SI-NEXT: v_writelane_b32 v63, s66, 16
+; SI-NEXT: v_writelane_b32 v63, s67, 17
+; SI-NEXT: v_writelane_b32 v63, s68, 18
+; SI-NEXT: v_writelane_b32 v63, s69, 19
+; SI-NEXT: v_writelane_b32 v63, s70, 20
+; SI-NEXT: v_writelane_b32 v63, s71, 21
; SI-NEXT: v_mov_b32_e32 v20, s16
-; SI-NEXT: v_writelane_b32 v63, s70, 22
+; SI-NEXT: v_writelane_b32 v63, s80, 22
; SI-NEXT: v_readfirstlane_b32 s58, v20
; SI-NEXT: v_mov_b32_e32 v20, s17
-; SI-NEXT: v_writelane_b32 v63, s71, 23
+; SI-NEXT: v_writelane_b32 v63, s81, 23
; SI-NEXT: v_readfirstlane_b32 s59, v20
; SI-NEXT: v_mov_b32_e32 v20, s18
-; SI-NEXT: v_writelane_b32 v63, s80, 24
+; SI-NEXT: v_writelane_b32 v63, s82, 24
; SI-NEXT: v_readfirstlane_b32 s56, v20
; SI-NEXT: v_mov_b32_e32 v20, s19
-; SI-NEXT: v_writelane_b32 v63, s81, 25
+; SI-NEXT: v_writelane_b32 v63, s83, 25
; SI-NEXT: v_readfirstlane_b32 s57, v20
; SI-NEXT: v_mov_b32_e32 v20, s20
-; SI-NEXT: v_writelane_b32 v63, s82, 26
+; SI-NEXT: v_writelane_b32 v63, s84, 26
; SI-NEXT: v_readfirstlane_b32 s46, v20
; SI-NEXT: v_mov_b32_e32 v20, s21
-; SI-NEXT: v_writelane_b32 v63, s83, 27
+; SI-NEXT: v_writelane_b32 v63, s85, 27
; SI-NEXT: v_readfirstlane_b32 s47, v20
; SI-NEXT: v_mov_b32_e32 v20, s22
-; SI-NEXT: v_writelane_b32 v63, s84, 28
+; SI-NEXT: v_writelane_b32 v63, s86, 28
; SI-NEXT: v_readfirstlane_b32 s44, v20
; SI-NEXT: v_mov_b32_e32 v20, s23
-; SI-NEXT: v_writelane_b32 v63, s85, 29
+; SI-NEXT: v_writelane_b32 v63, s87, 29
; SI-NEXT: v_readfirstlane_b32 s45, v20
; SI-NEXT: v_mov_b32_e32 v20, s24
-; SI-NEXT: v_writelane_b32 v63, s86, 30
+; SI-NEXT: v_writelane_b32 v63, s96, 30
; SI-NEXT: v_readfirstlane_b32 s42, v20
; SI-NEXT: v_mov_b32_e32 v20, s25
-; SI-NEXT: v_writelane_b32 v63, s87, 31
+; SI-NEXT: v_writelane_b32 v63, s97, 31
; SI-NEXT: v_readfirstlane_b32 s43, v20
; SI-NEXT: v_mov_b32_e32 v20, s26
-; SI-NEXT: v_writelane_b32 v63, s96, 32
+; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_readfirstlane_b32 s40, v20
; SI-NEXT: v_mov_b32_e32 v20, s27
-; SI-NEXT: v_writelane_b32 v63, s97, 33
+; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_readfirstlane_b32 s41, v20
; SI-NEXT: v_mov_b32_e32 v20, s28
-; SI-NEXT: v_writelane_b32 v63, s98, 34
+; SI-NEXT: v_writelane_b32 v63, s30, 34
; SI-NEXT: v_readfirstlane_b32 s24, v20
; SI-NEXT: v_mov_b32_e32 v20, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: v_writelane_b32 v63, s99, 35
+; SI-NEXT: v_writelane_b32 v63, s31, 35
; SI-NEXT: v_readfirstlane_b32 s25, v20
; SI-NEXT: v_readfirstlane_b32 s22, v1
; SI-NEXT: v_readfirstlane_b32 s23, v2
@@ -41567,34 +41567,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: s_lshl_b32 s4, s40, 8
; SI-NEXT: s_lshl_b32 s5, s52, 24
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_readlane_b32 s99, v63, 35
-; SI-NEXT: v_readlane_b32 s98, v63, 34
-; SI-NEXT: v_readlane_b32 s97, v63, 33
-; SI-NEXT: v_readlane_b32 s87, v63, 31
-; SI-NEXT: v_readlane_b32 s86, v63, 30
-; SI-NEXT: v_readlane_b32 s85, v63, 29
-; SI-NEXT: v_readlane_b32 s84, v63, 28
-; SI-NEXT: v_readlane_b32 s83, v63, 27
-; SI-NEXT: v_readlane_b32 s82, v63, 26
-; SI-NEXT: v_readlane_b32 s81, v63, 25
-; SI-NEXT: v_readlane_b32 s80, v63, 24
-; SI-NEXT: v_readlane_b32 s71, v63, 23
-; SI-NEXT: v_readlane_b32 s70, v63, 22
-; SI-NEXT: v_readlane_b32 s69, v63, 21
-; SI-NEXT: v_readlane_b32 s68, v63, 20
-; SI-NEXT: v_readlane_b32 s67, v63, 19
-; SI-NEXT: v_readlane_b32 s66, v63, 18
-; SI-NEXT: v_readlane_b32 s65, v63, 17
-; SI-NEXT: v_readlane_b32 s64, v63, 16
-; SI-NEXT: v_readlane_b32 s55, v63, 15
-; SI-NEXT: v_readlane_b32 s53, v63, 13
-; SI-NEXT: v_readlane_b32 s52, v63, 12
-; SI-NEXT: v_readlane_b32 s51, v63, 11
-; SI-NEXT: v_readlane_b32 s49, v63, 9
-; SI-NEXT: v_readlane_b32 s39, v63, 7
-; SI-NEXT: v_readlane_b32 s37, v63, 5
-; SI-NEXT: v_readlane_b32 s35, v63, 3
-; SI-NEXT: v_readlane_b32 s31, v63, 1
+; SI-NEXT: v_readlane_b32 s99, v63, 33
+; SI-NEXT: v_readlane_b32 s98, v63, 32
+; SI-NEXT: v_readlane_b32 s97, v63, 31
+; SI-NEXT: v_readlane_b32 s87, v63, 29
+; SI-NEXT: v_readlane_b32 s86, v63, 28
+; SI-NEXT: v_readlane_b32 s85, v63, 27
+; SI-NEXT: v_readlane_b32 s84, v63, 26
+; SI-NEXT: v_readlane_b32 s83, v63, 25
+; SI-NEXT: v_readlane_b32 s82, v63, 24
+; SI-NEXT: v_readlane_b32 s81, v63, 23
+; SI-NEXT: v_readlane_b32 s80, v63, 22
+; SI-NEXT: v_readlane_b32 s71, v63, 21
+; SI-NEXT: v_readlane_b32 s70, v63, 20
+; SI-NEXT: v_readlane_b32 s69, v63, 19
+; SI-NEXT: v_readlane_b32 s68, v63, 18
+; SI-NEXT: v_readlane_b32 s67, v63, 17
+; SI-NEXT: v_readlane_b32 s66, v63, 16
+; SI-NEXT: v_readlane_b32 s65, v63, 15
+; SI-NEXT: v_readlane_b32 s64, v63, 14
+; SI-NEXT: v_readlane_b32 s55, v63, 13
+; SI-NEXT: v_readlane_b32 s53, v63, 11
+; SI-NEXT: v_readlane_b32 s52, v63, 10
+; SI-NEXT: v_readlane_b32 s51, v63, 9
+; SI-NEXT: v_readlane_b32 s49, v63, 7
+; SI-NEXT: v_readlane_b32 s39, v63, 5
+; SI-NEXT: v_readlane_b32 s37, v63, 3
+; SI-NEXT: v_readlane_b32 s35, v63, 1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; SI-NEXT: v_or_b32_e32 v2, v6, v2
@@ -41616,8 +41615,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_and_b32_e32 v2, 0xff, v21
; SI-NEXT: s_lshl_b32 s4, s24, 8
; SI-NEXT: s_lshl_b32 s5, s48, 24
-; SI-NEXT: v_readlane_b32 s54, v63, 14
-; SI-NEXT: v_readlane_b32 s48, v63, 8
+; SI-NEXT: v_readlane_b32 s54, v63, 12
+; SI-NEXT: v_readlane_b32 s48, v63, 6
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
@@ -41649,8 +41648,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: s_lshl_b32 s4, s22, 8
; SI-NEXT: s_lshl_b32 s5, s36, 24
-; SI-NEXT: v_readlane_b32 s50, v63, 10
-; SI-NEXT: v_readlane_b32 s36, v63, 4
+; SI-NEXT: v_readlane_b32 s50, v63, 8
+; SI-NEXT: v_readlane_b32 s36, v63, 2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v4
@@ -41681,8 +41680,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: s_lshl_b32 s4, s20, 8
; SI-NEXT: s_lshl_b32 s5, s30, 24
-; SI-NEXT: v_readlane_b32 s38, v63, 6
-; SI-NEXT: v_readlane_b32 s30, v63, 0
+; SI-NEXT: v_readlane_b32 s30, v63, 34
+; SI-NEXT: v_readlane_b32 s31, v63, 35
+; SI-NEXT: v_readlane_b32 s38, v63, 4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v4
@@ -41713,7 +41713,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_lshl_b32 s4, s18, 8
; SI-NEXT: s_lshl_b32 s5, s94, 24
-; SI-NEXT: v_readlane_b32 s34, v63, 2
+; SI-NEXT: v_readlane_b32 s34, v63, 0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v4
@@ -41744,7 +41744,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v34
; SI-NEXT: s_lshl_b32 s4, s16, 8
; SI-NEXT: s_lshl_b32 s5, s90, 24
-; SI-NEXT: v_readlane_b32 s96, v63, 32
+; SI-NEXT: v_readlane_b32 s96, v63, 30
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v4
@@ -43774,72 +43774,72 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v74, s30, 0
-; GFX11-NEXT: v_writelane_b32 v75, s96, 0
+; GFX11-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-NEXT: v_writelane_b32 v75, s98, 0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: v_writelane_b32 v74, s31, 1
-; GFX11-NEXT: v_writelane_b32 v75, s97, 1
+; GFX11-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-NEXT: v_writelane_b32 v75, s99, 1
; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19
-; GFX11-NEXT: v_writelane_b32 v74, s34, 2
-; GFX11-NEXT: v_writelane_b32 v75, s98, 2
+; GFX11-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-NEXT: v_writelane_b32 v75, s100, 2
; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21
; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23
-; GFX11-NEXT: v_writelane_b32 v74, s35, 3
-; GFX11-NEXT: v_writelane_b32 v75, s99, 3
+; GFX11-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-NEXT: v_writelane_b32 v75, s101, 3
; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25
; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27
-; GFX11-NEXT: v_writelane_b32 v74, s36, 4
-; GFX11-NEXT: v_writelane_b32 v75, s100, 4
+; GFX11-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-NEXT: v_writelane_b32 v75, s102, 4
; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT: v_writelane_b32 v74, s37, 5
-; GFX11-NEXT: v_writelane_b32 v75, s101, 5
+; GFX11-NEXT: v_writelane_b32 v74, s39, 5
+; GFX11-NEXT: v_writelane_b32 v75, s103, 5
; GFX11-NEXT: v_readfirstlane_b32 s40, v16
; GFX11-NEXT: v_readfirstlane_b32 s41, v17
; GFX11-NEXT: v_readfirstlane_b32 s28, v18
-; GFX11-NEXT: v_writelane_b32 v74, s38, 6
-; GFX11-NEXT: v_writelane_b32 v75, s102, 6
+; GFX11-NEXT: v_writelane_b32 v74, s48, 6
+; GFX11-NEXT: v_writelane_b32 v75, s104, 6
; GFX11-NEXT: v_readfirstlane_b32 s29, v19
; GFX11-NEXT: v_readfirstlane_b32 s26, v20
; GFX11-NEXT: v_readfirstlane_b32 s27, v21
-; GFX11-NEXT: v_writelane_b32 v74, s39, 7
-; GFX11-NEXT: v_writelane_b32 v75, s103, 7
+; GFX11-NEXT: v_writelane_b32 v74, s49, 7
+; GFX11-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-NEXT: v_readfirstlane_b32 s24, v22
; GFX11-NEXT: v_readfirstlane_b32 s25, v23
; GFX11-NEXT: v_readfirstlane_b32 s22, v24
-; GFX11-NEXT: v_writelane_b32 v74, s48, 8
+; GFX11-NEXT: v_writelane_b32 v74, s50, 8
; GFX11-NEXT: v_readfirstlane_b32 s23, v25
; GFX11-NEXT: v_readfirstlane_b32 s20, v26
; GFX11-NEXT: v_readfirstlane_b32 s21, v27
; GFX11-NEXT: v_readfirstlane_b32 s18, v28
-; GFX11-NEXT: v_writelane_b32 v74, s49, 9
+; GFX11-NEXT: v_writelane_b32 v74, s51, 9
; GFX11-NEXT: v_readfirstlane_b32 s19, v29
; GFX11-NEXT: v_readfirstlane_b32 s16, v30
; GFX11-NEXT: v_readfirstlane_b32 s17, v31
; GFX11-NEXT: v_readfirstlane_b32 s14, v32
-; GFX11-NEXT: v_writelane_b32 v74, s50, 10
+; GFX11-NEXT: v_writelane_b32 v74, s52, 10
; GFX11-NEXT: v_readfirstlane_b32 s15, v33
; GFX11-NEXT: v_readfirstlane_b32 s12, v1
; GFX11-NEXT: v_readfirstlane_b32 s13, v2
; GFX11-NEXT: v_readfirstlane_b32 s10, v3
-; GFX11-NEXT: v_writelane_b32 v74, s51, 11
+; GFX11-NEXT: v_writelane_b32 v74, s53, 11
; GFX11-NEXT: v_readfirstlane_b32 s11, v4
; GFX11-NEXT: v_readfirstlane_b32 s0, v5
; GFX11-NEXT: v_readfirstlane_b32 s1, v6
; GFX11-NEXT: v_readfirstlane_b32 s2, v7
-; GFX11-NEXT: v_writelane_b32 v74, s52, 12
+; GFX11-NEXT: v_writelane_b32 v74, s54, 12
; GFX11-NEXT: v_readfirstlane_b32 s3, v8
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-NEXT: v_writelane_b32 v74, s53, 13
+; GFX11-NEXT: v_writelane_b32 v74, s55, 13
; GFX11-NEXT: v_readfirstlane_b32 s7, v12
; GFX11-NEXT: v_readfirstlane_b32 s8, v13
; GFX11-NEXT: v_readfirstlane_b32 s9, v14
; GFX11-NEXT: s_mov_b32 vcc_hi, 0
-; GFX11-NEXT: v_writelane_b32 v74, s54, 14
+; GFX11-NEXT: v_writelane_b32 v74, s64, 14
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
; GFX11-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68
@@ -43860,26 +43860,26 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8
; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4
; GFX11-NEXT: scratch_store_b32 off, v73, s32
-; GFX11-NEXT: v_writelane_b32 v75, s104, 8
+; GFX11-NEXT: v_writelane_b32 v75, s31, 8
; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT: v_writelane_b32 v74, s55, 15
-; GFX11-NEXT: v_writelane_b32 v74, s64, 16
-; GFX11-NEXT: v_writelane_b32 v74, s65, 17
-; GFX11-NEXT: v_writelane_b32 v74, s66, 18
-; GFX11-NEXT: v_writelane_b32 v74, s67, 19
-; GFX11-NEXT: v_writelane_b32 v74, s68, 20
-; GFX11-NEXT: v_writelane_b32 v74, s69, 21
-; GFX11-NEXT: v_writelane_b32 v74, s70, 22
-; GFX11-NEXT: v_writelane_b32 v74, s71, 23
-; GFX11-NEXT: v_writelane_b32 v74, s80, 24
-; GFX11-NEXT: v_writelane_b32 v74, s81, 25
-; GFX11-NEXT: v_writelane_b32 v74, s82, 26
-; GFX11-NEXT: v_writelane_b32 v74, s83, 27
-; GFX11-NEXT: v_writelane_b32 v74, s84, 28
-; GFX11-NEXT: v_writelane_b32 v74, s85, 29
-; GFX11-NEXT: v_writelane_b32 v74, s86, 30
-; GFX11-NEXT: v_writelane_b32 v74, s87, 31
+; GFX11-NEXT: v_writelane_b32 v74, s65, 15
+; GFX11-NEXT: v_writelane_b32 v74, s66, 16
+; GFX11-NEXT: v_writelane_b32 v74, s67, 17
+; GFX11-NEXT: v_writelane_b32 v74, s68, 18
+; GFX11-NEXT: v_writelane_b32 v74, s69, 19
+; GFX11-NEXT: v_writelane_b32 v74, s70, 20
+; GFX11-NEXT: v_writelane_b32 v74, s71, 21
+; GFX11-NEXT: v_writelane_b32 v74, s80, 22
+; GFX11-NEXT: v_writelane_b32 v74, s81, 23
+; GFX11-NEXT: v_writelane_b32 v74, s82, 24
+; GFX11-NEXT: v_writelane_b32 v74, s83, 25
+; GFX11-NEXT: v_writelane_b32 v74, s84, 26
+; GFX11-NEXT: v_writelane_b32 v74, s85, 27
+; GFX11-NEXT: v_writelane_b32 v74, s86, 28
+; GFX11-NEXT: v_writelane_b32 v74, s87, 29
+; GFX11-NEXT: v_writelane_b32 v74, s96, 30
+; GFX11-NEXT: v_writelane_b32 v74, s97, 31
; GFX11-NEXT: s_cbranch_scc0 .LBB37_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s9, 24
@@ -44609,47 +44609,47 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT: v_readlane_b32 s104, v75, 8
-; GFX11-NEXT: v_readlane_b32 s103, v75, 7
-; GFX11-NEXT: v_readlane_b32 s102, v75, 6
-; GFX11-NEXT: v_readlane_b32 s101, v75, 5
-; GFX11-NEXT: v_readlane_b32 s100, v75, 4
-; GFX11-NEXT: v_readlane_b32 s99, v75, 3
-; GFX11-NEXT: v_readlane_b32 s98, v75, 2
-; GFX11-NEXT: v_readlane_b32 s97, v75, 1
-; GFX11-NEXT: v_readlane_b32 s96, v75, 0
-; GFX11-NEXT: v_readlane_b32 s87, v74, 31
-; GFX11-NEXT: v_readlane_b32 s86, v74, 30
-; GFX11-NEXT: v_readlane_b32 s85, v74, 29
-; GFX11-NEXT: v_readlane_b32 s84, v74, 28
-; GFX11-NEXT: v_readlane_b32 s83, v74, 27
-; GFX11-NEXT: v_readlane_b32 s82, v74, 26
-; GFX11-NEXT: v_readlane_b32 s81, v74, 25
-; GFX11-NEXT: v_readlane_b32 s80, v74, 24
-; GFX11-NEXT: v_readlane_b32 s71, v74, 23
-; GFX11-NEXT: v_readlane_b32 s70, v74, 22
-; GFX11-NEXT: v_readlane_b32 s69, v74, 21
-; GFX11-NEXT: v_readlane_b32 s68, v74, 20
-; GFX11-NEXT: v_readlane_b32 s67, v74, 19
-; GFX11-NEXT: v_readlane_b32 s66, v74, 18
-; GFX11-NEXT: v_readlane_b32 s65, v74, 17
-; GFX11-NEXT: v_readlane_b32 s64, v74, 16
-; GFX11-NEXT: v_readlane_b32 s55, v74, 15
-; GFX11-NEXT: v_readlane_b32 s54, v74, 14
-; GFX11-NEXT: v_readlane_b32 s53, v74, 13
-; GFX11-NEXT: v_readlane_b32 s52, v74, 12
-; GFX11-NEXT: v_readlane_b32 s51, v74, 11
-; GFX11-NEXT: v_readlane_b32 s50, v74, 10
-; GFX11-NEXT: v_readlane_b32 s49, v74, 9
-; GFX11-NEXT: v_readlane_b32 s48, v74, 8
-; GFX11-NEXT: v_readlane_b32 s39, v74, 7
-; GFX11-NEXT: v_readlane_b32 s38, v74, 6
-; GFX11-NEXT: v_readlane_b32 s37, v74, 5
-; GFX11-NEXT: v_readlane_b32 s36, v74, 4
-; GFX11-NEXT: v_readlane_b32 s35, v74, 3
-; GFX11-NEXT: v_readlane_b32 s34, v74, 2
-; GFX11-NEXT: v_readlane_b32 s31, v74, 1
-; GFX11-NEXT: v_readlane_b32 s30, v74, 0
+; GFX11-NEXT: v_readlane_b32 s30, v75, 7
+; GFX11-NEXT: v_readlane_b32 s31, v75, 8
+; GFX11-NEXT: v_readlane_b32 s104, v75, 6
+; GFX11-NEXT: v_readlane_b32 s103, v75, 5
+; GFX11-NEXT: v_readlane_b32 s102, v75, 4
+; GFX11-NEXT: v_readlane_b32 s101, v75, 3
+; GFX11-NEXT: v_readlane_b32 s100, v75, 2
+; GFX11-NEXT: v_readlane_b32 s99, v75, 1
+; GFX11-NEXT: v_readlane_b32 s98, v75, 0
+; GFX11-NEXT: v_readlane_b32 s97, v74, 31
+; GFX11-NEXT: v_readlane_b32 s96, v74, 30
+; GFX11-NEXT: v_readlane_b32 s87, v74, 29
+; GFX11-NEXT: v_readlane_b32 s86, v74, 28
+; GFX11-NEXT: v_readlane_b32 s85, v74, 27
+; GFX11-NEXT: v_readlane_b32 s84, v74, 26
+; GFX11-NEXT: v_readlane_b32 s83, v74, 25
+; GFX11-NEXT: v_readlane_b32 s82, v74, 24
+; GFX11-NEXT: v_readlane_b32 s81, v74, 23
+; GFX11-NEXT: v_readlane_b32 s80, v74, 22
+; GFX11-NEXT: v_readlane_b32 s71, v74, 21
+; GFX11-NEXT: v_readlane_b32 s70, v74, 20
+; GFX11-NEXT: v_readlane_b32 s69, v74, 19
+; GFX11-NEXT: v_readlane_b32 s68, v74, 18
+; GFX11-NEXT: v_readlane_b32 s67, v74, 17
+; GFX11-NEXT: v_readlane_b32 s66, v74, 16
+; GFX11-NEXT: v_readlane_b32 s65, v74, 15
+; GFX11-NEXT: v_readlane_b32 s64, v74, 14
+; GFX11-NEXT: v_readlane_b32 s55, v74, 13
+; GFX11-NEXT: v_readlane_b32 s54, v74, 12
+; GFX11-NEXT: v_readlane_b32 s53, v74, 11
+; GFX11-NEXT: v_readlane_b32 s52, v74, 10
+; GFX11-NEXT: v_readlane_b32 s51, v74, 9
+; GFX11-NEXT: v_readlane_b32 s50, v74, 8
+; GFX11-NEXT: v_readlane_b32 s49, v74, 7
+; GFX11-NEXT: v_readlane_b32 s48, v74, 6
+; GFX11-NEXT: v_readlane_b32 s39, v74, 5
+; GFX11-NEXT: v_readlane_b32 s38, v74, 4
+; GFX11-NEXT: v_readlane_b32 s37, v74, 3
+; GFX11-NEXT: v_readlane_b32 s36, v74, 2
+; GFX11-NEXT: v_readlane_b32 s35, v74, 1
+; GFX11-NEXT: v_readlane_b32 s34, v74, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72
@@ -55698,70 +55698,70 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v63, s30, 0
-; SI-NEXT: v_writelane_b32 v63, s31, 1
-; SI-NEXT: v_writelane_b32 v63, s34, 2
-; SI-NEXT: v_writelane_b32 v63, s35, 3
-; SI-NEXT: v_writelane_b32 v63, s36, 4
-; SI-NEXT: v_writelane_b32 v63, s37, 5
-; SI-NEXT: v_writelane_b32 v63, s38, 6
-; SI-NEXT: v_writelane_b32 v63, s39, 7
-; SI-NEXT: v_writelane_b32 v63, s48, 8
-; SI-NEXT: v_writelane_b32 v63, s49, 9
-; SI-NEXT: v_writelane_b32 v63, s50, 10
-; SI-NEXT: v_writelane_b32 v63, s51, 11
-; SI-NEXT: v_writelane_b32 v63, s52, 12
-; SI-NEXT: v_writelane_b32 v63, s53, 13
-; SI-NEXT: v_writelane_b32 v63, s54, 14
-; SI-NEXT: v_writelane_b32 v63, s55, 15
-; SI-NEXT: v_writelane_b32 v63, s64, 16
-; SI-NEXT: v_writelane_b32 v63, s65, 17
-; SI-NEXT: v_writelane_b32 v63, s66, 18
-; SI-NEXT: v_writelane_b32 v63, s67, 19
-; SI-NEXT: v_writelane_b32 v63, s68, 20
-; SI-NEXT: v_writelane_b32 v63, s69, 21
+; SI-NEXT: v_writelane_b32 v63, s34, 0
+; SI-NEXT: v_writelane_b32 v63, s35, 1
+; SI-NEXT: v_writelane_b32 v63, s36, 2
+; SI-NEXT: v_writelane_b32 v63, s37, 3
+; SI-NEXT: v_writelane_b32 v63, s38, 4
+; SI-NEXT: v_writelane_b32 v63, s39, 5
+; SI-NEXT: v_writelane_b32 v63, s48, 6
+; SI-NEXT: v_writelane_b32 v63, s49, 7
+; SI-NEXT: v_writelane_b32 v63, s50, 8
+; SI-NEXT: v_writelane_b32 v63, s51, 9
+; SI-NEXT: v_writelane_b32 v63, s52, 10
+; SI-NEXT: v_writelane_b32 v63, s53, 11
+; SI-NEXT: v_writelane_b32 v63, s54, 12
+; SI-NEXT: v_writelane_b32 v63, s55, 13
+; SI-NEXT: v_writelane_b32 v63, s64, 14
+; SI-NEXT: v_writelane_b32 v63, s65, 15
+; SI-NEXT: v_writelane_b32 v63, s66, 16
+; SI-NEXT: v_writelane_b32 v63, s67, 17
+; SI-NEXT: v_writelane_b32 v63, s68, 18
+; SI-NEXT: v_writelane_b32 v63, s69, 19
+; SI-NEXT: v_writelane_b32 v63, s70, 20
+; SI-NEXT: v_writelane_b32 v63, s71, 21
; SI-NEXT: v_mov_b32_e32 v19, s16
-; SI-NEXT: v_writelane_b32 v63, s70, 22
+; SI-NEXT: v_writelane_b32 v63, s80, 22
; SI-NEXT: v_readfirstlane_b32 s6, v19
; SI-NEXT: v_mov_b32_e32 v19, s17
-; SI-NEXT: v_writelane_b32 v63, s71, 23
+; SI-NEXT: v_writelane_b32 v63, s81, 23
; SI-NEXT: v_readfirstlane_b32 s7, v19
; SI-NEXT: v_mov_b32_e32 v19, s18
-; SI-NEXT: v_writelane_b32 v63, s80, 24
+; SI-NEXT: v_writelane_b32 v63, s82, 24
; SI-NEXT: v_readfirstlane_b32 s10, v19
; SI-NEXT: v_mov_b32_e32 v19, s19
-; SI-NEXT: v_writelane_b32 v63, s81, 25
+; SI-NEXT: v_writelane_b32 v63, s83, 25
; SI-NEXT: v_readfirstlane_b32 s12, v19
; SI-NEXT: v_mov_b32_e32 v19, s20
-; SI-NEXT: v_writelane_b32 v63, s82, 26
+; SI-NEXT: v_writelane_b32 v63, s84, 26
; SI-NEXT: v_readfirstlane_b32 s14, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
-; SI-NEXT: v_writelane_b32 v63, s83, 27
+; SI-NEXT: v_writelane_b32 v63, s85, 27
; SI-NEXT: v_readfirstlane_b32 s8, v19
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v63, s84, 28
+; SI-NEXT: v_writelane_b32 v63, s86, 28
; SI-NEXT: v_readfirstlane_b32 s9, v19
; SI-NEXT: v_mov_b32_e32 v19, s23
-; SI-NEXT: v_writelane_b32 v63, s85, 29
+; SI-NEXT: v_writelane_b32 v63, s87, 29
; SI-NEXT: v_readfirstlane_b32 s11, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
-; SI-NEXT: v_writelane_b32 v63, s86, 30
+; SI-NEXT: v_writelane_b32 v63, s96, 30
; SI-NEXT: v_readfirstlane_b32 s13, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
-; SI-NEXT: v_writelane_b32 v63, s87, 31
+; SI-NEXT: v_writelane_b32 v63, s97, 31
; SI-NEXT: v_readfirstlane_b32 s15, v19
; SI-NEXT: v_mov_b32_e32 v19, s26
-; SI-NEXT: v_writelane_b32 v63, s96, 32
+; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_readfirstlane_b32 s16, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
-; SI-NEXT: v_writelane_b32 v63, s97, 33
+; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_readfirstlane_b32 s17, v19
; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_writelane_b32 v63, s98, 34
+; SI-NEXT: v_writelane_b32 v63, s30, 34
; SI-NEXT: v_readfirstlane_b32 s18, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT: v_writelane_b32 v63, s99, 35
+; SI-NEXT: v_writelane_b32 v63, s31, 35
; SI-NEXT: v_readfirstlane_b32 s19, v19
; SI-NEXT: v_readfirstlane_b32 s20, v0
; SI-NEXT: v_readfirstlane_b32 s21, v1
@@ -56236,42 +56236,42 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54
; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50
-; SI-NEXT: v_readlane_b32 s99, v63, 35
-; SI-NEXT: v_readlane_b32 s98, v63, 34
-; SI-NEXT: v_readlane_b32 s97, v63, 33
-; SI-NEXT: v_readlane_b32 s96, v63, 32
-; SI-NEXT: v_readlane_b32 s87, v63, 31
-; SI-NEXT: v_readlane_b32 s86, v63, 30
-; SI-NEXT: v_readlane_b32 s85, v63, 29
-; SI-NEXT: v_readlane_b32 s84, v63, 28
-; SI-NEXT: v_readlane_b32 s83, v63, 27
-; SI-NEXT: v_readlane_b32 s82, v63, 26
-; SI-NEXT: v_readlane_b32 s81, v63, 25
-; SI-NEXT: v_readlane_b32 s80, v63, 24
-; SI-NEXT: v_readlane_b32 s71, v63, 23
-; SI-NEXT: v_readlane_b32 s70, v63, 22
-; SI-NEXT: v_readlane_b32 s69, v63, 21
-; SI-NEXT: v_readlane_b32 s68, v63, 20
-; SI-NEXT: v_readlane_b32 s67, v63, 19
-; SI-NEXT: v_readlane_b32 s66, v63, 18
-; SI-NEXT: v_readlane_b32 s65, v63, 17
-; SI-NEXT: v_readlane_b32 s64, v63, 16
-; SI-NEXT: v_readlane_b32 s55, v63, 15
-; SI-NEXT: v_readlane_b32 s54, v63, 14
-; SI-NEXT: v_readlane_b32 s53, v63, 13
-; SI-NEXT: v_readlane_b32 s52, v63, 12
-; SI-NEXT: v_readlane_b32 s51, v63, 11
-; SI-NEXT: v_readlane_b32 s50, v63, 10
-; SI-NEXT: v_readlane_b32 s49, v63, 9
-; SI-NEXT: v_readlane_b32 s48, v63, 8
-; SI-NEXT: v_readlane_b32 s39, v63, 7
-; SI-NEXT: v_readlane_b32 s38, v63, 6
-; SI-NEXT: v_readlane_b32 s37, v63, 5
-; SI-NEXT: v_readlane_b32 s36, v63, 4
-; SI-NEXT: v_readlane_b32 s35, v63, 3
-; SI-NEXT: v_readlane_b32 s34, v63, 2
-; SI-NEXT: v_readlane_b32 s31, v63, 1
-; SI-NEXT: v_readlane_b32 s30, v63, 0
+; SI-NEXT: v_readlane_b32 s30, v63, 34
+; SI-NEXT: v_readlane_b32 s31, v63, 35
+; SI-NEXT: v_readlane_b32 s99, v63, 33
+; SI-NEXT: v_readlane_b32 s98, v63, 32
+; SI-NEXT: v_readlane_b32 s97, v63, 31
+; SI-NEXT: v_readlane_b32 s96, v63, 30
+; SI-NEXT: v_readlane_b32 s87, v63, 29
+; SI-NEXT: v_readlane_b32 s86, v63, 28
+; SI-NEXT: v_readlane_b32 s85, v63, 27
+; SI-NEXT: v_readlane_b32 s84, v63, 26
+; SI-NEXT: v_readlane_b32 s83, v63, 25
+; SI-NEXT: v_readlane_b32 s82, v63, 24
+; SI-NEXT: v_readlane_b32 s81, v63, 23
+; SI-NEXT: v_readlane_b32 s80, v63, 22
+; SI-NEXT: v_readlane_b32 s71, v63, 21
+; SI-NEXT: v_readlane_b32 s70, v63, 20
+; SI-NEXT: v_readlane_b32 s69, v63, 19
+; SI-NEXT: v_readlane_b32 s68, v63, 18
+; SI-NEXT: v_readlane_b32 s67, v63, 17
+; SI-NEXT: v_readlane_b32 s66, v63, 16
+; SI-NEXT: v_readlane_b32 s65, v63, 15
+; SI-NEXT: v_readlane_b32 s64, v63, 14
+; SI-NEXT: v_readlane_b32 s55, v63, 13
+; SI-NEXT: v_readlane_b32 s54, v63, 12
+; SI-NEXT: v_readlane_b32 s53, v63, 11
+; SI-NEXT: v_readlane_b32 s52, v63, 10
+; SI-NEXT: v_readlane_b32 s51, v63, 9
+; SI-NEXT: v_readlane_b32 s50, v63, 8
+; SI-NEXT: v_readlane_b32 s49, v63, 7
+; SI-NEXT: v_readlane_b32 s48, v63, 6
+; SI-NEXT: v_readlane_b32 s39, v63, 5
+; SI-NEXT: v_readlane_b32 s38, v63, 4
+; SI-NEXT: v_readlane_b32 s37, v63, 3
+; SI-NEXT: v_readlane_b32 s36, v63, 2
+; SI-NEXT: v_readlane_b32 s35, v63, 1
+; SI-NEXT: v_readlane_b32 s34, v63, 0
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mul_f32_e32 v48, 1.0, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -73409,69 +73409,69 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v21, s30, 0
-; SI-NEXT: v_writelane_b32 v21, s31, 1
-; SI-NEXT: v_writelane_b32 v21, s34, 2
-; SI-NEXT: v_writelane_b32 v21, s35, 3
-; SI-NEXT: v_writelane_b32 v21, s36, 4
-; SI-NEXT: v_writelane_b32 v21, s37, 5
-; SI-NEXT: v_writelane_b32 v21, s38, 6
-; SI-NEXT: v_writelane_b32 v21, s39, 7
-; SI-NEXT: v_writelane_b32 v21, s48, 8
-; SI-NEXT: v_writelane_b32 v21, s49, 9
-; SI-NEXT: v_writelane_b32 v21, s50, 10
-; SI-NEXT: v_writelane_b32 v21, s51, 11
-; SI-NEXT: v_writelane_b32 v21, s52, 12
-; SI-NEXT: v_writelane_b32 v21, s53, 13
-; SI-NEXT: v_writelane_b32 v21, s54, 14
-; SI-NEXT: v_writelane_b32 v21, s55, 15
-; SI-NEXT: v_writelane_b32 v21, s64, 16
-; SI-NEXT: v_writelane_b32 v21, s65, 17
-; SI-NEXT: v_writelane_b32 v21, s66, 18
-; SI-NEXT: v_writelane_b32 v21, s67, 19
-; SI-NEXT: v_writelane_b32 v21, s68, 20
+; SI-NEXT: v_writelane_b32 v21, s34, 0
+; SI-NEXT: v_writelane_b32 v21, s35, 1
+; SI-NEXT: v_writelane_b32 v21, s36, 2
+; SI-NEXT: v_writelane_b32 v21, s37, 3
+; SI-NEXT: v_writelane_b32 v21, s38, 4
+; SI-NEXT: v_writelane_b32 v21, s39, 5
+; SI-NEXT: v_writelane_b32 v21, s48, 6
+; SI-NEXT: v_writelane_b32 v21, s49, 7
+; SI-NEXT: v_writelane_b32 v21, s50, 8
+; SI-NEXT: v_writelane_b32 v21, s51, 9
+; SI-NEXT: v_writelane_b32 v21, s52, 10
+; SI-NEXT: v_writelane_b32 v21, s53, 11
+; SI-NEXT: v_writelane_b32 v21, s54, 12
+; SI-NEXT: v_writelane_b32 v21, s55, 13
+; SI-NEXT: v_writelane_b32 v21, s64, 14
+; SI-NEXT: v_writelane_b32 v21, s65, 15
+; SI-NEXT: v_writelane_b32 v21, s66, 16
+; SI-NEXT: v_writelane_b32 v21, s67, 17
+; SI-NEXT: v_writelane_b32 v21, s68, 18
+; SI-NEXT: v_writelane_b32 v21, s69, 19
+; SI-NEXT: v_writelane_b32 v21, s70, 20
; SI-NEXT: v_mov_b32_e32 v20, s16
-; SI-NEXT: v_writelane_b32 v21, s69, 21
+; SI-NEXT: v_writelane_b32 v21, s71, 21
; SI-NEXT: v_readfirstlane_b32 s56, v20
; SI-NEXT: v_mov_b32_e32 v20, s17
-; SI-NEXT: v_writelane_b32 v21, s70, 22
+; SI-NEXT: v_writelane_b32 v21, s80, 22
; SI-NEXT: v_readfirstlane_b32 s57, v20
; SI-NEXT: v_mov_b32_e32 v20, s18
-; SI-NEXT: v_writelane_b32 v21, s71, 23
+; SI-NEXT: v_writelane_b32 v21, s81, 23
; SI-NEXT: v_readfirstlane_b32 s46, v20
; SI-NEXT: v_mov_b32_e32 v20, s19
-; SI-NEXT: v_writelane_b32 v21, s80, 24
+; SI-NEXT: v_writelane_b32 v21, s82, 24
; SI-NEXT: v_readfirstlane_b32 s47, v20
; SI-NEXT: v_mov_b32_e32 v20, s20
-; SI-NEXT: v_writelane_b32 v21, s81, 25
+; SI-NEXT: v_writelane_b32 v21, s83, 25
; SI-NEXT: v_readfirstlane_b32 s44, v20
; SI-NEXT: v_mov_b32_e32 v20, s21
-; SI-NEXT: v_writelane_b32 v21, s82, 26
+; SI-NEXT: v_writelane_b32 v21, s84, 26
; SI-NEXT: v_readfirstlane_b32 s45, v20
; SI-NEXT: v_mov_b32_e32 v20, s22
-; SI-NEXT: v_writelane_b32 v21, s83, 27
+; SI-NEXT: v_writelane_b32 v21, s85, 27
; SI-NEXT: v_readfirstlane_b32 s42, v20
; SI-NEXT: v_mov_b32_e32 v20, s23
-; SI-NEXT: v_writelane_b32 v21, s84, 28
+; SI-NEXT: v_writelane_b32 v21, s86, 28
; SI-NEXT: v_readfirstlane_b32 s43, v20
; SI-NEXT: v_mov_b32_e32 v20, s24
-; SI-NEXT: v_writelane_b32 v21, s85, 29
+; SI-NEXT: v_writelane_b32 v21, s87, 29
; SI-NEXT: v_readfirstlane_b32 s40, v20
; SI-NEXT: v_mov_b32_e32 v20, s25
-; SI-NEXT: v_writelane_b32 v21, s86, 30
+; SI-NEXT: v_writelane_b32 v21, s96, 30
; SI-NEXT: v_readfirstlane_b32 s41, v20
; SI-NEXT: v_mov_b32_e32 v20, s26
-; SI-NEXT: v_writelane_b32 v21, s87, 31
+; SI-NEXT: v_writelane_b32 v21, s97, 31
; SI-NEXT: v_readfirstlane_b32 s24, v20
; SI-NEXT: v_mov_b32_e32 v20, s27
-; SI-NEXT: v_writelane_b32 v21, s96, 32
+; SI-NEXT: v_writelane_b32 v21, s98, 32
; SI-NEXT: v_readfirstlane_b32 s25, v20
; SI-NEXT: v_mov_b32_e32 v20, s28
-; SI-NEXT: v_writelane_b32 v21, s97, 33
+; SI-NEXT: v_writelane_b32 v21, s99, 33
; SI-NEXT: v_readfirstlane_b32 s22, v20
; SI-NEXT: v_mov_b32_e32 v20, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: v_writelane_b32 v21, s98, 34
+; SI-NEXT: v_writelane_b32 v21, s30, 34
; SI-NEXT: v_readfirstlane_b32 s23, v20
; SI-NEXT: v_readfirstlane_b32 s20, v1
; SI-NEXT: v_readfirstlane_b32 s21, v2
@@ -73492,7 +73492,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v17
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v18
-; SI-NEXT: v_writelane_b32 v21, s99, 35
+; SI-NEXT: v_writelane_b32 v21, s31, 35
; SI-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB57_4
@@ -74361,6 +74361,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v21, 34
; SI-NEXT: v_readlane_b32 s21, v23, 5
; SI-NEXT: v_readlane_b32 s19, v23, 11
; SI-NEXT: v_readlane_b32 s17, v23, 17
@@ -74369,42 +74370,41 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_readlane_b32 s11, v23, 35
; SI-NEXT: v_readlane_b32 s9, v23, 39
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v21, 35
-; SI-NEXT: v_readlane_b32 s98, v21, 34
-; SI-NEXT: v_readlane_b32 s97, v21, 33
-; SI-NEXT: v_readlane_b32 s96, v21, 32
-; SI-NEXT: v_readlane_b32 s87, v21, 31
-; SI-NEXT: v_readlane_b32 s86, v21, 30
-; SI-NEXT: v_readlane_b32 s85, v21, 29
-; SI-NEXT: v_readlane_b32 s84, v21, 28
-; SI-NEXT: v_readlane_b32 s83, v21, 27
-; SI-NEXT: v_readlane_b32 s82, v21, 26
-; SI-NEXT: v_readlane_b32 s81, v21, 25
-; SI-NEXT: v_readlane_b32 s80, v21, 24
-; SI-NEXT: v_readlane_b32 s71, v21, 23
-; SI-NEXT: v_readlane_b32 s70, v21, 22
-; SI-NEXT: v_readlane_b32 s69, v21, 21
-; SI-NEXT: v_readlane_b32 s68, v21, 20
-; SI-NEXT: v_readlane_b32 s67, v21, 19
-; SI-NEXT: v_readlane_b32 s66, v21, 18
-; SI-NEXT: v_readlane_b32 s65, v21, 17
-; SI-NEXT: v_readlane_b32 s64, v21, 16
-; SI-NEXT: v_readlane_b32 s55, v21, 15
-; SI-NEXT: v_readlane_b32 s54, v21, 14
-; SI-NEXT: v_readlane_b32 s53, v21, 13
-; SI-NEXT: v_readlane_b32 s52, v21, 12
-; SI-NEXT: v_readlane_b32 s51, v21, 11
-; SI-NEXT: v_readlane_b32 s50, v21, 10
-; SI-NEXT: v_readlane_b32 s49, v21, 9
-; SI-NEXT: v_readlane_b32 s48, v21, 8
-; SI-NEXT: v_readlane_b32 s39, v21, 7
-; SI-NEXT: v_readlane_b32 s38, v21, 6
-; SI-NEXT: v_readlane_b32 s37, v21, 5
-; SI-NEXT: v_readlane_b32 s36, v21, 4
-; SI-NEXT: v_readlane_b32 s35, v21, 3
-; SI-NEXT: v_readlane_b32 s34, v21, 2
-; SI-NEXT: v_readlane_b32 s31, v21, 1
-; SI-NEXT: v_readlane_b32 s30, v21, 0
+; SI-NEXT: v_readlane_b32 s31, v21, 35
+; SI-NEXT: v_readlane_b32 s99, v21, 33
+; SI-NEXT: v_readlane_b32 s98, v21, 32
+; SI-NEXT: v_readlane_b32 s97, v21, 31
+; SI-NEXT: v_readlane_b32 s96, v21, 30
+; SI-NEXT: v_readlane_b32 s87, v21, 29
+; SI-NEXT: v_readlane_b32 s86, v21, 28
+; SI-NEXT: v_readlane_b32 s85, v21, 27
+; SI-NEXT: v_readlane_b32 s84, v21, 26
+; SI-NEXT: v_readlane_b32 s83, v21, 25
+; SI-NEXT: v_readlane_b32 s82, v21, 24
+; SI-NEXT: v_readlane_b32 s81, v21, 23
+; SI-NEXT: v_readlane_b32 s80, v21, 22
+; SI-NEXT: v_readlane_b32 s71, v21, 21
+; SI-NEXT: v_readlane_b32 s70, v21, 20
+; SI-NEXT: v_readlane_b32 s69, v21, 19
+; SI-NEXT: v_readlane_b32 s68, v21, 18
+; SI-NEXT: v_readlane_b32 s67, v21, 17
+; SI-NEXT: v_readlane_b32 s66, v21, 16
+; SI-NEXT: v_readlane_b32 s65, v21, 15
+; SI-NEXT: v_readlane_b32 s64, v21, 14
+; SI-NEXT: v_readlane_b32 s55, v21, 13
+; SI-NEXT: v_readlane_b32 s54, v21, 12
+; SI-NEXT: v_readlane_b32 s53, v21, 11
+; SI-NEXT: v_readlane_b32 s52, v21, 10
+; SI-NEXT: v_readlane_b32 s51, v21, 9
+; SI-NEXT: v_readlane_b32 s50, v21, 8
+; SI-NEXT: v_readlane_b32 s49, v21, 7
+; SI-NEXT: v_readlane_b32 s48, v21, 6
+; SI-NEXT: v_readlane_b32 s39, v21, 5
+; SI-NEXT: v_readlane_b32 s38, v21, 4
+; SI-NEXT: v_readlane_b32 s37, v21, 3
+; SI-NEXT: v_readlane_b32 s36, v21, 2
+; SI-NEXT: v_readlane_b32 s35, v21, 1
+; SI-NEXT: v_readlane_b32 s34, v21, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -74601,65 +74601,65 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v32, s30, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 1
-; VI-NEXT: v_writelane_b32 v32, s34, 2
-; VI-NEXT: v_writelane_b32 v32, s35, 3
-; VI-NEXT: v_writelane_b32 v32, s36, 4
-; VI-NEXT: v_writelane_b32 v32, s37, 5
-; VI-NEXT: v_writelane_b32 v32, s38, 6
-; VI-NEXT: v_writelane_b32 v32, s39, 7
-; VI-NEXT: v_writelane_b32 v32, s48, 8
-; VI-NEXT: v_writelane_b32 v32, s49, 9
-; VI-NEXT: v_writelane_b32 v32, s50, 10
-; VI-NEXT: v_writelane_b32 v32, s51, 11
-; VI-NEXT: v_writelane_b32 v32, s52, 12
-; VI-NEXT: v_writelane_b32 v32, s53, 13
-; VI-NEXT: v_writelane_b32 v32, s54, 14
-; VI-NEXT: v_writelane_b32 v32, s55, 15
-; VI-NEXT: v_writelane_b32 v32, s64, 16
+; VI-NEXT: v_writelane_b32 v32, s34, 0
+; VI-NEXT: v_writelane_b32 v32, s35, 1
+; VI-NEXT: v_writelane_b32 v32, s36, 2
+; VI-NEXT: v_writelane_b32 v32, s37, 3
+; VI-NEXT: v_writelane_b32 v32, s38, 4
+; VI-NEXT: v_writelane_b32 v32, s39, 5
+; VI-NEXT: v_writelane_b32 v32, s48, 6
+; VI-NEXT: v_writelane_b32 v32, s49, 7
+; VI-NEXT: v_writelane_b32 v32, s50, 8
+; VI-NEXT: v_writelane_b32 v32, s51, 9
+; VI-NEXT: v_writelane_b32 v32, s52, 10
+; VI-NEXT: v_writelane_b32 v32, s53, 11
+; VI-NEXT: v_writelane_b32 v32, s54, 12
+; VI-NEXT: v_writelane_b32 v32, s55, 13
+; VI-NEXT: v_writelane_b32 v32, s64, 14
+; VI-NEXT: v_writelane_b32 v32, s65, 15
+; VI-NEXT: v_writelane_b32 v32, s66, 16
; VI-NEXT: v_mov_b32_e32 v20, s16
-; VI-NEXT: v_writelane_b32 v32, s65, 17
+; VI-NEXT: v_writelane_b32 v32, s67, 17
; VI-NEXT: v_readfirstlane_b32 s56, v20
; VI-NEXT: v_mov_b32_e32 v20, s17
-; VI-NEXT: v_writelane_b32 v32, s66, 18
+; VI-NEXT: v_writelane_b32 v32, s68, 18
; VI-NEXT: v_readfirstlane_b32 s57, v20
; VI-NEXT: v_mov_b32_e32 v20, s18
-; VI-NEXT: v_writelane_b32 v32, s67, 19
+; VI-NEXT: v_writelane_b32 v32, s69, 19
; VI-NEXT: v_readfirstlane_b32 s46, v20
; VI-NEXT: v_mov_b32_e32 v20, s19
-; VI-NEXT: v_writelane_b32 v32, s68, 20
+; VI-NEXT: v_writelane_b32 v32, s70, 20
; VI-NEXT: v_readfirstlane_b32 s47, v20
; VI-NEXT: v_mov_b32_e32 v20, s20
-; VI-NEXT: v_writelane_b32 v32, s69, 21
+; VI-NEXT: v_writelane_b32 v32, s71, 21
; VI-NEXT: v_readfirstlane_b32 s44, v20
; VI-NEXT: v_mov_b32_e32 v20, s21
-; VI-NEXT: v_writelane_b32 v32, s70, 22
+; VI-NEXT: v_writelane_b32 v32, s80, 22
; VI-NEXT: v_readfirstlane_b32 s45, v20
; VI-NEXT: v_mov_b32_e32 v20, s22
-; VI-NEXT: v_writelane_b32 v32, s71, 23
+; VI-NEXT: v_writelane_b32 v32, s81, 23
; VI-NEXT: v_readfirstlane_b32 s42, v20
; VI-NEXT: v_mov_b32_e32 v20, s23
-; VI-NEXT: v_writelane_b32 v32, s80, 24
+; VI-NEXT: v_writelane_b32 v32, s82, 24
; VI-NEXT: v_readfirstlane_b32 s43, v20
; VI-NEXT: v_mov_b32_e32 v20, s24
-; VI-NEXT: v_writelane_b32 v32, s81, 25
+; VI-NEXT: v_writelane_b32 v32, s83, 25
; VI-NEXT: v_readfirstlane_b32 s40, v20
; VI-NEXT: v_mov_b32_e32 v20, s25
-; VI-NEXT: v_writelane_b32 v32, s82, 26
+; VI-NEXT: v_writelane_b32 v32, s84, 26
; VI-NEXT: v_readfirstlane_b32 s41, v20
; VI-NEXT: v_mov_b32_e32 v20, s26
-; VI-NEXT: v_writelane_b32 v32, s83, 27
+; VI-NEXT: v_writelane_b32 v32, s85, 27
; VI-NEXT: v_readfirstlane_b32 s24, v20
; VI-NEXT: v_mov_b32_e32 v20, s27
-; VI-NEXT: v_writelane_b32 v32, s84, 28
+; VI-NEXT: v_writelane_b32 v32, s86, 28
; VI-NEXT: v_readfirstlane_b32 s25, v20
; VI-NEXT: v_mov_b32_e32 v20, s28
-; VI-NEXT: v_writelane_b32 v32, s85, 29
+; VI-NEXT: v_writelane_b32 v32, s87, 29
; VI-NEXT: v_readfirstlane_b32 s22, v20
; VI-NEXT: v_mov_b32_e32 v20, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT: v_writelane_b32 v32, s86, 30
+; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: v_readfirstlane_b32 s23, v20
; VI-NEXT: v_readfirstlane_b32 s20, v1
; VI-NEXT: v_readfirstlane_b32 s21, v2
@@ -74680,7 +74680,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: s_and_b64 s[26:27], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v18
-; VI-NEXT: v_writelane_b32 v32, s87, 31
+; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB57_4
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -75339,40 +75339,40 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: v_readlane_b32 s30, v32, 30
; VI-NEXT: v_readlane_b32 s7, v33, 1
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s87, v32, 31
-; VI-NEXT: v_readlane_b32 s86, v32, 30
-; VI-NEXT: v_readlane_b32 s85, v32, 29
-; VI-NEXT: v_readlane_b32 s84, v32, 28
-; VI-NEXT: v_readlane_b32 s83, v32, 27
-; VI-NEXT: v_readlane_b32 s82, v32, 26
-; VI-NEXT: v_readlane_b32 s81, v32, 25
-; VI-NEXT: v_readlane_b32 s80, v32, 24
-; VI-NEXT: v_readlane_b32 s71, v32, 23
-; VI-NEXT: v_readlane_b32 s70, v32, 22
-; VI-NEXT: v_readlane_b32 s69, v32, 21
-; VI-NEXT: v_readlane_b32 s68, v32, 20
-; VI-NEXT: v_readlane_b32 s67, v32, 19
-; VI-NEXT: v_readlane_b32 s66, v32, 18
-; VI-NEXT: v_readlane_b32 s65, v32, 17
-; VI-NEXT: v_readlane_b32 s64, v32, 16
-; VI-NEXT: v_readlane_b32 s55, v32, 15
-; VI-NEXT: v_readlane_b32 s54, v32, 14
-; VI-NEXT: v_readlane_b32 s53, v32, 13
-; VI-NEXT: v_readlane_b32 s52, v32, 12
-; VI-NEXT: v_readlane_b32 s51, v32, 11
-; VI-NEXT: v_readlane_b32 s50, v32, 10
-; VI-NEXT: v_readlane_b32 s49, v32, 9
-; VI-NEXT: v_readlane_b32 s48, v32, 8
-; VI-NEXT: v_readlane_b32 s39, v32, 7
-; VI-NEXT: v_readlane_b32 s38, v32, 6
-; VI-NEXT: v_readlane_b32 s37, v32, 5
-; VI-NEXT: v_readlane_b32 s36, v32, 4
-; VI-NEXT: v_readlane_b32 s35, v32, 3
-; VI-NEXT: v_readlane_b32 s34, v32, 2
-; VI-NEXT: v_readlane_b32 s31, v32, 1
-; VI-NEXT: v_readlane_b32 s30, v32, 0
+; VI-NEXT: v_readlane_b32 s31, v32, 31
+; VI-NEXT: v_readlane_b32 s87, v32, 29
+; VI-NEXT: v_readlane_b32 s86, v32, 28
+; VI-NEXT: v_readlane_b32 s85, v32, 27
+; VI-NEXT: v_readlane_b32 s84, v32, 26
+; VI-NEXT: v_readlane_b32 s83, v32, 25
+; VI-NEXT: v_readlane_b32 s82, v32, 24
+; VI-NEXT: v_readlane_b32 s81, v32, 23
+; VI-NEXT: v_readlane_b32 s80, v32, 22
+; VI-NEXT: v_readlane_b32 s71, v32, 21
+; VI-NEXT: v_readlane_b32 s70, v32, 20
+; VI-NEXT: v_readlane_b32 s69, v32, 19
+; VI-NEXT: v_readlane_b32 s68, v32, 18
+; VI-NEXT: v_readlane_b32 s67, v32, 17
+; VI-NEXT: v_readlane_b32 s66, v32, 16
+; VI-NEXT: v_readlane_b32 s65, v32, 15
+; VI-NEXT: v_readlane_b32 s64, v32, 14
+; VI-NEXT: v_readlane_b32 s55, v32, 13
+; VI-NEXT: v_readlane_b32 s54, v32, 12
+; VI-NEXT: v_readlane_b32 s53, v32, 11
+; VI-NEXT: v_readlane_b32 s52, v32, 10
+; VI-NEXT: v_readlane_b32 s51, v32, 9
+; VI-NEXT: v_readlane_b32 s50, v32, 8
+; VI-NEXT: v_readlane_b32 s49, v32, 7
+; VI-NEXT: v_readlane_b32 s48, v32, 6
+; VI-NEXT: v_readlane_b32 s39, v32, 5
+; VI-NEXT: v_readlane_b32 s38, v32, 4
+; VI-NEXT: v_readlane_b32 s37, v32, 3
+; VI-NEXT: v_readlane_b32 s36, v32, 2
+; VI-NEXT: v_readlane_b32 s35, v32, 1
+; VI-NEXT: v_readlane_b32 s34, v32, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -75543,69 +75543,69 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v29, s30, 0
-; GFX9-NEXT: v_writelane_b32 v29, s31, 1
-; GFX9-NEXT: v_writelane_b32 v29, s34, 2
-; GFX9-NEXT: v_writelane_b32 v29, s35, 3
-; GFX9-NEXT: v_writelane_b32 v29, s36, 4
-; GFX9-NEXT: v_writelane_b32 v29, s37, 5
-; GFX9-NEXT: v_writelane_b32 v29, s38, 6
-; GFX9-NEXT: v_writelane_b32 v29, s39, 7
-; GFX9-NEXT: v_writelane_b32 v29, s48, 8
-; GFX9-NEXT: v_writelane_b32 v29, s49, 9
-; GFX9-NEXT: v_writelane_b32 v29, s50, 10
-; GFX9-NEXT: v_writelane_b32 v29, s51, 11
-; GFX9-NEXT: v_writelane_b32 v29, s52, 12
-; GFX9-NEXT: v_writelane_b32 v29, s53, 13
-; GFX9-NEXT: v_writelane_b32 v29, s54, 14
-; GFX9-NEXT: v_writelane_b32 v29, s55, 15
-; GFX9-NEXT: v_writelane_b32 v29, s64, 16
-; GFX9-NEXT: v_writelane_b32 v29, s65, 17
-; GFX9-NEXT: v_writelane_b32 v29, s66, 18
-; GFX9-NEXT: v_writelane_b32 v29, s67, 19
-; GFX9-NEXT: v_writelane_b32 v29, s68, 20
+; GFX9-NEXT: v_writelane_b32 v29, s34, 0
+; GFX9-NEXT: v_writelane_b32 v29, s35, 1
+; GFX9-NEXT: v_writelane_b32 v29, s36, 2
+; GFX9-NEXT: v_writelane_b32 v29, s37, 3
+; GFX9-NEXT: v_writelane_b32 v29, s38, 4
+; GFX9-NEXT: v_writelane_b32 v29, s39, 5
+; GFX9-NEXT: v_writelane_b32 v29, s48, 6
+; GFX9-NEXT: v_writelane_b32 v29, s49, 7
+; GFX9-NEXT: v_writelane_b32 v29, s50, 8
+; GFX9-NEXT: v_writelane_b32 v29, s51, 9
+; GFX9-NEXT: v_writelane_b32 v29, s52, 10
+; GFX9-NEXT: v_writelane_b32 v29, s53, 11
+; GFX9-NEXT: v_writelane_b32 v29, s54, 12
+; GFX9-NEXT: v_writelane_b32 v29, s55, 13
+; GFX9-NEXT: v_writelane_b32 v29, s64, 14
+; GFX9-NEXT: v_writelane_b32 v29, s65, 15
+; GFX9-NEXT: v_writelane_b32 v29, s66, 16
+; GFX9-NEXT: v_writelane_b32 v29, s67, 17
+; GFX9-NEXT: v_writelane_b32 v29, s68, 18
+; GFX9-NEXT: v_writelane_b32 v29, s69, 19
+; GFX9-NEXT: v_writelane_b32 v29, s70, 20
; GFX9-NEXT: v_mov_b32_e32 v20, s16
-; GFX9-NEXT: v_writelane_b32 v29, s69, 21
+; GFX9-NEXT: v_writelane_b32 v29, s71, 21
; GFX9-NEXT: v_readfirstlane_b32 s56, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s17
-; GFX9-NEXT: v_writelane_b32 v29, s70, 22
+; GFX9-NEXT: v_writelane_b32 v29, s80, 22
; GFX9-NEXT: v_readfirstlane_b32 s57, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s18
-; GFX9-NEXT: v_writelane_b32 v29, s71, 23
+; GFX9-NEXT: v_writelane_b32 v29, s81, 23
; GFX9-NEXT: v_readfirstlane_b32 s46, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s19
-; GFX9-NEXT: v_writelane_b32 v29, s80, 24
+; GFX9-NEXT: v_writelane_b32 v29, s82, 24
; GFX9-NEXT: v_readfirstlane_b32 s47, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s20
-; GFX9-NEXT: v_writelane_b32 v29, s81, 25
+; GFX9-NEXT: v_writelane_b32 v29, s83, 25
; GFX9-NEXT: v_readfirstlane_b32 s44, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s21
-; GFX9-NEXT: v_writelane_b32 v29, s82, 26
+; GFX9-NEXT: v_writelane_b32 v29, s84, 26
; GFX9-NEXT: v_readfirstlane_b32 s45, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s22
-; GFX9-NEXT: v_writelane_b32 v29, s83, 27
+; GFX9-NEXT: v_writelane_b32 v29, s85, 27
; GFX9-NEXT: v_readfirstlane_b32 s42, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s23
-; GFX9-NEXT: v_writelane_b32 v29, s84, 28
+; GFX9-NEXT: v_writelane_b32 v29, s86, 28
; GFX9-NEXT: v_readfirstlane_b32 s43, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s24
-; GFX9-NEXT: v_writelane_b32 v29, s85, 29
+; GFX9-NEXT: v_writelane_b32 v29, s87, 29
; GFX9-NEXT: v_readfirstlane_b32 s40, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s25
-; GFX9-NEXT: v_writelane_b32 v29, s86, 30
+; GFX9-NEXT: v_writelane_b32 v29, s96, 30
; GFX9-NEXT: v_readfirstlane_b32 s41, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s26
-; GFX9-NEXT: v_writelane_b32 v29, s87, 31
+; GFX9-NEXT: v_writelane_b32 v29, s97, 31
; GFX9-NEXT: v_readfirstlane_b32 s24, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s27
-; GFX9-NEXT: v_writelane_b32 v29, s96, 32
+; GFX9-NEXT: v_writelane_b32 v29, s98, 32
; GFX9-NEXT: v_readfirstlane_b32 s25, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s28
-; GFX9-NEXT: v_writelane_b32 v29, s97, 33
+; GFX9-NEXT: v_writelane_b32 v29, s99, 33
; GFX9-NEXT: v_readfirstlane_b32 s22, v20
; GFX9-NEXT: v_mov_b32_e32 v20, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; GFX9-NEXT: v_writelane_b32 v29, s98, 34
+; GFX9-NEXT: v_writelane_b32 v29, s30, 34
; GFX9-NEXT: v_readfirstlane_b32 s23, v20
; GFX9-NEXT: v_readfirstlane_b32 s20, v1
; GFX9-NEXT: v_readfirstlane_b32 s21, v2
@@ -75626,7 +75626,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s4, v17
; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v18
-; GFX9-NEXT: v_writelane_b32 v29, s99, 35
+; GFX9-NEXT: v_writelane_b32 v29, s31, 35
; GFX9-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
; GFX9-NEXT: s_cbranch_scc0 .LBB57_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
@@ -76231,43 +76231,43 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX9-NEXT: v_perm_b32 v1, s4, v3, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT: v_readlane_b32 s30, v29, 34
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: v_readlane_b32 s99, v29, 35
-; GFX9-NEXT: v_readlane_b32 s98, v29, 34
-; GFX9-NEXT: v_readlane_b32 s97, v29, 33
-; GFX9-NEXT: v_readlane_b32 s96, v29, 32
-; GFX9-NEXT: v_readlane_b32 s87, v29, 31
-; GFX9-NEXT: v_readlane_b32 s86, v29, 30
-; GFX9-NEXT: v_readlane_b32 s85, v29, 29
-; GFX9-NEXT: v_readlane_b32 s84, v29, 28
-; GFX9-NEXT: v_readlane_b32 s83, v29, 27
-; GFX9-NEXT: v_readlane_b32 s82, v29, 26
-; GFX9-NEXT: v_readlane_b32 s81, v29, 25
-; GFX9-NEXT: v_readlane_b32 s80, v29, 24
-; GFX9-NEXT: v_readlane_b32 s71, v29, 23
-; GFX9-NEXT: v_readlane_b32 s70, v29, 22
-; GFX9-NEXT: v_readlane_b32 s69, v29, 21
-; GFX9-NEXT: v_readlane_b32 s68, v29, 20
-; GFX9-NEXT: v_readlane_b32 s67, v29, 19
-; GFX9-NEXT: v_readlane_b32 s66, v29, 18
-; GFX9-NEXT: v_readlane_b32 s65, v29, 17
-; GFX9-NEXT: v_readlane_b32 s64, v29, 16
-; GFX9-NEXT: v_readlane_b32 s55, v29, 15
-; GFX9-NEXT: v_readlane_b32 s54, v29, 14
-; GFX9-NEXT: v_readlane_b32 s53, v29, 13
-; GFX9-NEXT: v_readlane_b32 s52, v29, 12
-; GFX9-NEXT: v_readlane_b32 s51, v29, 11
-; GFX9-NEXT: v_readlane_b32 s50, v29, 10
-; GFX9-NEXT: v_readlane_b32 s49, v29, 9
-; GFX9-NEXT: v_readlane_b32 s48, v29, 8
-; GFX9-NEXT: v_readlane_b32 s39, v29, 7
-; GFX9-NEXT: v_readlane_b32 s38, v29, 6
-; GFX9-NEXT: v_readlane_b32 s37, v29, 5
-; GFX9-NEXT: v_readlane_b32 s36, v29, 4
-; GFX9-NEXT: v_readlane_b32 s35, v29, 3
-; GFX9-NEXT: v_readlane_b32 s34, v29, 2
-; GFX9-NEXT: v_readlane_b32 s31, v29, 1
-; GFX9-NEXT: v_readlane_b32 s30, v29, 0
+; GFX9-NEXT: v_readlane_b32 s31, v29, 35
+; GFX9-NEXT: v_readlane_b32 s99, v29, 33
+; GFX9-NEXT: v_readlane_b32 s98, v29, 32
+; GFX9-NEXT: v_readlane_b32 s97, v29, 31
+; GFX9-NEXT: v_readlane_b32 s96, v29, 30
+; GFX9-NEXT: v_readlane_b32 s87, v29, 29
+; GFX9-NEXT: v_readlane_b32 s86, v29, 28
+; GFX9-NEXT: v_readlane_b32 s85, v29, 27
+; GFX9-NEXT: v_readlane_b32 s84, v29, 26
+; GFX9-NEXT: v_readlane_b32 s83, v29, 25
+; GFX9-NEXT: v_readlane_b32 s82, v29, 24
+; GFX9-NEXT: v_readlane_b32 s81, v29, 23
+; GFX9-NEXT: v_readlane_b32 s80, v29, 22
+; GFX9-NEXT: v_readlane_b32 s71, v29, 21
+; GFX9-NEXT: v_readlane_b32 s70, v29, 20
+; GFX9-NEXT: v_readlane_b32 s69, v29, 19
+; GFX9-NEXT: v_readlane_b32 s68, v29, 18
+; GFX9-NEXT: v_readlane_b32 s67, v29, 17
+; GFX9-NEXT: v_readlane_b32 s66, v29, 16
+; GFX9-NEXT: v_readlane_b32 s65, v29, 15
+; GFX9-NEXT: v_readlane_b32 s64, v29, 14
+; GFX9-NEXT: v_readlane_b32 s55, v29, 13
+; GFX9-NEXT: v_readlane_b32 s54, v29, 12
+; GFX9-NEXT: v_readlane_b32 s53, v29, 11
+; GFX9-NEXT: v_readlane_b32 s52, v29, 10
+; GFX9-NEXT: v_readlane_b32 s51, v29, 9
+; GFX9-NEXT: v_readlane_b32 s50, v29, 8
+; GFX9-NEXT: v_readlane_b32 s49, v29, 7
+; GFX9-NEXT: v_readlane_b32 s48, v29, 6
+; GFX9-NEXT: v_readlane_b32 s39, v29, 5
+; GFX9-NEXT: v_readlane_b32 s38, v29, 4
+; GFX9-NEXT: v_readlane_b32 s37, v29, 3
+; GFX9-NEXT: v_readlane_b32 s36, v29, 2
+; GFX9-NEXT: v_readlane_b32 s35, v29, 1
+; GFX9-NEXT: v_readlane_b32 s34, v29, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -76433,93 +76433,93 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v36, s32 offset:8
; GFX11-NEXT: scratch_store_b32 off, v37, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v34, s30, 0
-; GFX11-NEXT: v_writelane_b32 v35, s96, 0
+; GFX11-NEXT: v_writelane_b32 v34, s34, 0
+; GFX11-NEXT: v_writelane_b32 v35, s98, 0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: v_writelane_b32 v34, s31, 1
-; GFX11-NEXT: v_writelane_b32 v35, s97, 1
+; GFX11-NEXT: v_writelane_b32 v34, s35, 1
+; GFX11-NEXT: v_writelane_b32 v35, s99, 1
; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19
-; GFX11-NEXT: v_writelane_b32 v34, s34, 2
-; GFX11-NEXT: v_writelane_b32 v35, s98, 2
+; GFX11-NEXT: v_writelane_b32 v34, s36, 2
+; GFX11-NEXT: v_writelane_b32 v35, s100, 2
; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21
; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23
-; GFX11-NEXT: v_writelane_b32 v34, s35, 3
-; GFX11-NEXT: v_writelane_b32 v35, s99, 3
+; GFX11-NEXT: v_writelane_b32 v34, s37, 3
+; GFX11-NEXT: v_writelane_b32 v35, s101, 3
; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25
; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27
-; GFX11-NEXT: v_writelane_b32 v34, s36, 4
-; GFX11-NEXT: v_writelane_b32 v35, s100, 4
+; GFX11-NEXT: v_writelane_b32 v34, s38, 4
+; GFX11-NEXT: v_writelane_b32 v35, s102, 4
; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT: v_writelane_b32 v34, s37, 5
-; GFX11-NEXT: v_writelane_b32 v35, s101, 5
+; GFX11-NEXT: v_writelane_b32 v34, s39, 5
+; GFX11-NEXT: v_writelane_b32 v35, s103, 5
; GFX11-NEXT: v_readfirstlane_b32 s40, v16
; GFX11-NEXT: v_readfirstlane_b32 s41, v17
; GFX11-NEXT: v_readfirstlane_b32 s28, v18
-; GFX11-NEXT: v_writelane_b32 v34, s38, 6
-; GFX11-NEXT: v_writelane_b32 v35, s102, 6
+; GFX11-NEXT: v_writelane_b32 v34, s48, 6
+; GFX11-NEXT: v_writelane_b32 v35, s104, 6
; GFX11-NEXT: v_readfirstlane_b32 s29, v19
; GFX11-NEXT: v_readfirstlane_b32 s26, v20
; GFX11-NEXT: v_readfirstlane_b32 s27, v21
-; GFX11-NEXT: v_writelane_b32 v34, s39, 7
-; GFX11-NEXT: v_writelane_b32 v35, s103, 7
+; GFX11-NEXT: v_writelane_b32 v34, s49, 7
+; GFX11-NEXT: v_writelane_b32 v35, s30, 7
; GFX11-NEXT: v_readfirstlane_b32 s24, v22
; GFX11-NEXT: v_readfirstlane_b32 s25, v23
; GFX11-NEXT: v_readfirstlane_b32 s22, v24
-; GFX11-NEXT: v_writelane_b32 v34, s48, 8
+; GFX11-NEXT: v_writelane_b32 v34, s50, 8
; GFX11-NEXT: v_readfirstlane_b32 s23, v25
; GFX11-NEXT: v_readfirstlane_b32 s20, v26
; GFX11-NEXT: v_readfirstlane_b32 s21, v27
; GFX11-NEXT: v_readfirstlane_b32 s18, v28
-; GFX11-NEXT: v_writelane_b32 v34, s49, 9
+; GFX11-NEXT: v_writelane_b32 v34, s51, 9
; GFX11-NEXT: v_readfirstlane_b32 s19, v29
; GFX11-NEXT: v_readfirstlane_b32 s16, v30
; GFX11-NEXT: v_readfirstlane_b32 s17, v31
; GFX11-NEXT: v_readfirstlane_b32 s14, v32
-; GFX11-NEXT: v_writelane_b32 v34, s50, 10
+; GFX11-NEXT: v_writelane_b32 v34, s52, 10
; GFX11-NEXT: v_readfirstlane_b32 s15, v33
; GFX11-NEXT: v_readfirstlane_b32 s12, v1
; GFX11-NEXT: v_readfirstlane_b32 s13, v2
; GFX11-NEXT: v_readfirstlane_b32 s10, v3
-; GFX11-NEXT: v_writelane_b32 v34, s51, 11
+; GFX11-NEXT: v_writelane_b32 v34, s53, 11
; GFX11-NEXT: v_readfirstlane_b32 s11, v4
; GFX11-NEXT: v_readfirstlane_b32 s8, v5
; GFX11-NEXT: v_readfirstlane_b32 s9, v6
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
-; GFX11-NEXT: v_writelane_b32 v34, s52, 12
+; GFX11-NEXT: v_writelane_b32 v34, s54, 12
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s2, v11
-; GFX11-NEXT: v_writelane_b32 v34, s53, 13
+; GFX11-NEXT: v_writelane_b32 v34, s55, 13
; GFX11-NEXT: v_readfirstlane_b32 s3, v12
; GFX11-NEXT: v_readfirstlane_b32 s0, v13
; GFX11-NEXT: v_readfirstlane_b32 s1, v14
; GFX11-NEXT: s_mov_b32 vcc_hi, 0
-; GFX11-NEXT: v_writelane_b32 v34, s54, 14
+; GFX11-NEXT: v_writelane_b32 v34, s64, 14
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: v_writelane_b32 v35, s104, 8
+; GFX11-NEXT: v_writelane_b32 v35, s31, 8
; GFX11-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane
; GFX11-NEXT: ; implicit-def: $vgpr36 : SGPR spill to VGPR lane
-; GFX11-NEXT: v_writelane_b32 v34, s55, 15
-; GFX11-NEXT: v_writelane_b32 v34, s64, 16
-; GFX11-NEXT: v_writelane_b32 v34, s65, 17
-; GFX11-NEXT: v_writelane_b32 v34, s66, 18
-; GFX11-NEXT: v_writelane_b32 v34, s67, 19
-; GFX11-NEXT: v_writelane_b32 v34, s68, 20
-; GFX11-NEXT: v_writelane_b32 v34, s69, 21
-; GFX11-NEXT: v_writelane_b32 v34, s70, 22
-; GFX11-NEXT: v_writelane_b32 v34, s71, 23
-; GFX11-NEXT: v_writelane_b32 v34, s80, 24
-; GFX11-NEXT: v_writelane_b32 v34, s81, 25
-; GFX11-NEXT: v_writelane_b32 v34, s82, 26
-; GFX11-NEXT: v_writelane_b32 v34, s83, 27
-; GFX11-NEXT: v_writelane_b32 v34, s84, 28
-; GFX11-NEXT: v_writelane_b32 v34, s85, 29
-; GFX11-NEXT: v_writelane_b32 v34, s86, 30
-; GFX11-NEXT: v_writelane_b32 v34, s87, 31
+; GFX11-NEXT: v_writelane_b32 v34, s65, 15
+; GFX11-NEXT: v_writelane_b32 v34, s66, 16
+; GFX11-NEXT: v_writelane_b32 v34, s67, 17
+; GFX11-NEXT: v_writelane_b32 v34, s68, 18
+; GFX11-NEXT: v_writelane_b32 v34, s69, 19
+; GFX11-NEXT: v_writelane_b32 v34, s70, 20
+; GFX11-NEXT: v_writelane_b32 v34, s71, 21
+; GFX11-NEXT: v_writelane_b32 v34, s80, 22
+; GFX11-NEXT: v_writelane_b32 v34, s81, 23
+; GFX11-NEXT: v_writelane_b32 v34, s82, 24
+; GFX11-NEXT: v_writelane_b32 v34, s83, 25
+; GFX11-NEXT: v_writelane_b32 v34, s84, 26
+; GFX11-NEXT: v_writelane_b32 v34, s85, 27
+; GFX11-NEXT: v_writelane_b32 v34, s86, 28
+; GFX11-NEXT: v_writelane_b32 v34, s87, 29
+; GFX11-NEXT: v_writelane_b32 v34, s96, 30
+; GFX11-NEXT: v_writelane_b32 v34, s97, 31
; GFX11-NEXT: s_cbranch_scc0 .LBB57_2
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s72, s18, 16
@@ -76988,13 +76988,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: .LBB57_5: ; %end
; GFX11-NEXT: v_mov_b32_e32 v1, 0xc0c0004
; GFX11-NEXT: v_readlane_b32 s73, v36, 7
-; GFX11-NEXT: v_readlane_b32 s31, v34, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v2, s73, s30, v1
; GFX11-NEXT: v_readlane_b32 s73, v36, 8
-; GFX11-NEXT: v_readlane_b32 s30, v34, 0
+; GFX11-NEXT: v_readlane_b32 s30, v35, 7
+; GFX11-NEXT: v_readlane_b32 s31, v35, 8
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v3, s40, s73, v1
; GFX11-NEXT: v_readlane_b32 s40, v36, 6
; GFX11-NEXT: v_perm_b32 v22, s77, s80, v1
@@ -77003,7 +77003,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-NEXT: v_perm_b32 v4, s41, s40, v1
; GFX11-NEXT: v_readlane_b32 s40, v36, 2
-; GFX11-NEXT: v_readlane_b32 s80, v34, 24
+; GFX11-NEXT: v_readlane_b32 s80, v34, 22
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, s40, s92, v1
; GFX11-NEXT: v_readlane_b32 s40, v36, 3
@@ -77057,7 +77057,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_perm_b32 v6, s20, s104, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v14
; GFX11-NEXT: v_perm_b32 v14, s3, s39, v1
-; GFX11-NEXT: v_readlane_b32 s104, v35, 8
+; GFX11-NEXT: v_readlane_b32 s104, v35, 6
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v21
@@ -77070,8 +77070,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v5, v13, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v15
; GFX11-NEXT: v_perm_b32 v10, s19, s97, v1
-; GFX11-NEXT: v_readlane_b32 s97, v35, 1
-; GFX11-NEXT: v_readlane_b32 s39, v34, 7
+; GFX11-NEXT: v_readlane_b32 s97, v34, 31
+; GFX11-NEXT: v_readlane_b32 s39, v34, 5
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off offset:16
; GFX11-NEXT: v_perm_b32 v5, s102, s101, v1
; GFX11-NEXT: v_or_b32_e32 v3, v17, v8
@@ -77116,8 +77116,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v12
; GFX11-NEXT: v_or_b32_e32 v3, v10, v11
; GFX11-NEXT: v_perm_b32 v10, s63, s67, v1
-; GFX11-NEXT: v_readlane_b32 s103, v35, 7
-; GFX11-NEXT: v_readlane_b32 s102, v35, 6
+; GFX11-NEXT: v_readlane_b32 s103, v35, 5
+; GFX11-NEXT: v_readlane_b32 s102, v35, 4
; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-NEXT: v_perm_b32 v5, s13, s62, v1
; GFX11-NEXT: v_readlane_b32 s12, v37, 6
@@ -77129,7 +77129,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v5, v5, v7
; GFX11-NEXT: v_readlane_b32 s11, v37, 9
; GFX11-NEXT: v_perm_b32 v12, s58, s10, v1
-; GFX11-NEXT: v_readlane_b32 s101, v35, 5
+; GFX11-NEXT: v_readlane_b32 s101, v35, 3
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v6
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off offset:64
@@ -77140,8 +77140,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v12
; GFX11-NEXT: v_or_b32_e32 v7, v10, v11
; GFX11-NEXT: v_perm_b32 v10, s56, s57, v1
-; GFX11-NEXT: v_readlane_b32 s100, v35, 4
-; GFX11-NEXT: v_readlane_b32 s99, v35, 3
+; GFX11-NEXT: v_readlane_b32 s100, v35, 2
+; GFX11-NEXT: v_readlane_b32 s99, v35, 1
; GFX11-NEXT: v_or_b32_e32 v8, v8, v9
; GFX11-NEXT: v_perm_b32 v9, s9, s55, v1
; GFX11-NEXT: v_readlane_b32 s8, v37, 10
@@ -77153,12 +77153,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v9, v9, v3
; GFX11-NEXT: v_readlane_b32 s7, v37, 13
; GFX11-NEXT: v_perm_b32 v12, s51, s6, v1
-; GFX11-NEXT: v_readlane_b32 s98, v35, 2
+; GFX11-NEXT: v_readlane_b32 s98, v35, 0
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-NEXT: v_readlane_b32 s96, v35, 0
-; GFX11-NEXT: v_readlane_b32 s87, v34, 31
-; GFX11-NEXT: v_readlane_b32 s86, v34, 30
+; GFX11-NEXT: v_readlane_b32 s96, v34, 30
+; GFX11-NEXT: v_readlane_b32 s87, v34, 29
+; GFX11-NEXT: v_readlane_b32 s86, v34, 28
; GFX11-NEXT: v_or_b32_e32 v2, v4, v5
; GFX11-NEXT: v_perm_b32 v4, s4, s50, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v12
@@ -77172,51 +77172,51 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_readlane_b32 s5, v37, 15
; GFX11-NEXT: v_readlane_b32 s3, v37, 17
-; GFX11-NEXT: v_readlane_b32 s85, v34, 29
+; GFX11-NEXT: v_readlane_b32 s85, v34, 27
; GFX11-NEXT: v_perm_b32 v11, s42, s4, v1
; GFX11-NEXT: v_or_b32_e32 v5, v5, v10
; GFX11-NEXT: v_perm_b32 v10, s48, s38, v1
-; GFX11-NEXT: v_readlane_b32 s84, v34, 28
-; GFX11-NEXT: v_readlane_b32 s83, v34, 27
+; GFX11-NEXT: v_readlane_b32 s84, v34, 26
+; GFX11-NEXT: v_readlane_b32 s83, v34, 25
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_readlane_b32 s82, v34, 26
+; GFX11-NEXT: v_readlane_b32 s82, v34, 24
; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_readlane_b32 s81, v34, 25
-; GFX11-NEXT: v_readlane_b32 s71, v34, 23
+; GFX11-NEXT: v_readlane_b32 s81, v34, 23
+; GFX11-NEXT: v_readlane_b32 s71, v34, 21
; GFX11-NEXT: v_or_b32_e32 v10, v12, v11
; GFX11-NEXT: v_perm_b32 v12, s36, s2, v1
; GFX11-NEXT: v_or_b32_e32 v11, v14, v13
; GFX11-NEXT: v_perm_b32 v13, s0, s37, v1
-; GFX11-NEXT: v_readlane_b32 s70, v34, 22
-; GFX11-NEXT: v_readlane_b32 s69, v34, 21
+; GFX11-NEXT: v_readlane_b32 s70, v34, 20
+; GFX11-NEXT: v_readlane_b32 s69, v34, 19
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_readlane_b32 s68, v34, 20
-; GFX11-NEXT: v_readlane_b32 s67, v34, 19
-; GFX11-NEXT: v_readlane_b32 s66, v34, 18
-; GFX11-NEXT: v_readlane_b32 s65, v34, 17
+; GFX11-NEXT: v_readlane_b32 s68, v34, 18
+; GFX11-NEXT: v_readlane_b32 s67, v34, 17
+; GFX11-NEXT: v_readlane_b32 s66, v34, 16
+; GFX11-NEXT: v_readlane_b32 s65, v34, 15
; GFX11-NEXT: v_or_b32_e32 v12, v13, v12
; GFX11-NEXT: v_perm_b32 v13, s34, vcc_hi, v1
; GFX11-NEXT: v_perm_b32 v1, s1, s35, v1
-; GFX11-NEXT: v_readlane_b32 s64, v34, 16
-; GFX11-NEXT: v_readlane_b32 s55, v34, 15
-; GFX11-NEXT: v_readlane_b32 s54, v34, 14
+; GFX11-NEXT: v_readlane_b32 s64, v34, 14
+; GFX11-NEXT: v_readlane_b32 s55, v34, 13
+; GFX11-NEXT: v_readlane_b32 s54, v34, 12
; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT: v_readlane_b32 s53, v34, 13
-; GFX11-NEXT: v_readlane_b32 s52, v34, 12
-; GFX11-NEXT: v_readlane_b32 s51, v34, 11
-; GFX11-NEXT: v_readlane_b32 s50, v34, 10
+; GFX11-NEXT: v_readlane_b32 s53, v34, 11
+; GFX11-NEXT: v_readlane_b32 s52, v34, 10
+; GFX11-NEXT: v_readlane_b32 s51, v34, 9
+; GFX11-NEXT: v_readlane_b32 s50, v34, 8
; GFX11-NEXT: v_or_b32_e32 v13, v1, v13
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:112
-; GFX11-NEXT: v_readlane_b32 s49, v34, 9
-; GFX11-NEXT: v_readlane_b32 s48, v34, 8
-; GFX11-NEXT: v_readlane_b32 s38, v34, 6
-; GFX11-NEXT: v_readlane_b32 s37, v34, 5
-; GFX11-NEXT: v_readlane_b32 s36, v34, 4
-; GFX11-NEXT: v_readlane_b32 s35, v34, 3
-; GFX11-NEXT: v_readlane_b32 s34, v34, 2
+; GFX11-NEXT: v_readlane_b32 s49, v34, 7
+; GFX11-NEXT: v_readlane_b32 s48, v34, 6
+; GFX11-NEXT: v_readlane_b32 s38, v34, 4
+; GFX11-NEXT: v_readlane_b32 s37, v34, 3
+; GFX11-NEXT: v_readlane_b32 s36, v34, 2
+; GFX11-NEXT: v_readlane_b32 s35, v34, 1
+; GFX11-NEXT: v_readlane_b32 s34, v34, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v34, off, s32
@@ -88289,70 +88289,69 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v33, s30, 0
-; SI-NEXT: v_writelane_b32 v33, s31, 1
-; SI-NEXT: v_writelane_b32 v33, s34, 2
-; SI-NEXT: v_writelane_b32 v33, s35, 3
-; SI-NEXT: v_writelane_b32 v33, s36, 4
-; SI-NEXT: v_writelane_b32 v33, s37, 5
-; SI-NEXT: v_writelane_b32 v33, s38, 6
-; SI-NEXT: v_writelane_b32 v33, s39, 7
-; SI-NEXT: v_writelane_b32 v33, s48, 8
-; SI-NEXT: v_writelane_b32 v33, s49, 9
-; SI-NEXT: v_writelane_b32 v33, s50, 10
-; SI-NEXT: v_writelane_b32 v33, s51, 11
-; SI-NEXT: v_writelane_b32 v33, s52, 12
-; SI-NEXT: v_writelane_b32 v33, s53, 13
-; SI-NEXT: v_writelane_b32 v33, s54, 14
-; SI-NEXT: v_writelane_b32 v33, s55, 15
-; SI-NEXT: v_writelane_b32 v33, s64, 16
-; SI-NEXT: v_writelane_b32 v33, s65, 17
-; SI-NEXT: v_writelane_b32 v33, s66, 18
-; SI-NEXT: v_writelane_b32 v33, s67, 19
-; SI-NEXT: v_writelane_b32 v33, s68, 20
-; SI-NEXT: v_writelane_b32 v33, s69, 21
+; SI-NEXT: v_writelane_b32 v33, s34, 0
+; SI-NEXT: v_writelane_b32 v33, s35, 1
+; SI-NEXT: v_writelane_b32 v33, s36, 2
+; SI-NEXT: v_writelane_b32 v33, s37, 3
+; SI-NEXT: v_writelane_b32 v33, s38, 4
+; SI-NEXT: v_writelane_b32 v33, s39, 5
+; SI-NEXT: v_writelane_b32 v33, s48, 6
+; SI-NEXT: v_writelane_b32 v33, s49, 7
+; SI-NEXT: v_writelane_b32 v33, s50, 8
+; SI-NEXT: v_writelane_b32 v33, s51, 9
+; SI-NEXT: v_writelane_b32 v33, s52, 10
+; SI-NEXT: v_writelane_b32 v33, s53, 11
+; SI-NEXT: v_writelane_b32 v33, s54, 12
+; SI-NEXT: v_writelane_b32 v33, s55, 13
+; SI-NEXT: v_writelane_b32 v33, s64, 14
+; SI-NEXT: v_writelane_b32 v33, s65, 15
+; SI-NEXT: v_writelane_b32 v33, s66, 16
+; SI-NEXT: v_writelane_b32 v33, s67, 17
+; SI-NEXT: v_writelane_b32 v33, s68, 18
+; SI-NEXT: v_writelane_b32 v33, s69, 19
+; SI-NEXT: v_writelane_b32 v33, s70, 20
; SI-NEXT: v_mov_b32_e32 v19, s16
-; SI-NEXT: v_writelane_b32 v33, s70, 22
+; SI-NEXT: v_writelane_b32 v33, s71, 21
; SI-NEXT: v_readfirstlane_b32 s48, v19
; SI-NEXT: v_mov_b32_e32 v19, s17
-; SI-NEXT: v_writelane_b32 v33, s71, 23
+; SI-NEXT: v_writelane_b32 v33, s80, 22
; SI-NEXT: v_readfirstlane_b32 s49, v19
; SI-NEXT: v_mov_b32_e32 v19, s18
-; SI-NEXT: v_writelane_b32 v33, s80, 24
+; SI-NEXT: v_writelane_b32 v33, s81, 23
; SI-NEXT: v_readfirstlane_b32 s50, v19
; SI-NEXT: v_mov_b32_e32 v19, s19
-; SI-NEXT: v_writelane_b32 v33, s81, 25
+; SI-NEXT: v_writelane_b32 v33, s82, 24
; SI-NEXT: v_readfirstlane_b32 s51, v19
; SI-NEXT: v_mov_b32_e32 v19, s20
-; SI-NEXT: v_writelane_b32 v33, s82, 26
+; SI-NEXT: v_writelane_b32 v33, s83, 25
; SI-NEXT: v_readfirstlane_b32 s52, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
-; SI-NEXT: v_writelane_b32 v33, s83, 27
+; SI-NEXT: v_writelane_b32 v33, s84, 26
; SI-NEXT: v_readfirstlane_b32 s53, v19
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v33, s84, 28
+; SI-NEXT: v_writelane_b32 v33, s85, 27
; SI-NEXT: v_readfirstlane_b32 s54, v19
; SI-NEXT: v_mov_b32_e32 v19, s23
-; SI-NEXT: v_writelane_b32 v33, s85, 29
+; SI-NEXT: v_writelane_b32 v33, s86, 28
; SI-NEXT: v_readfirstlane_b32 s55, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
-; SI-NEXT: v_writelane_b32 v33, s86, 30
+; SI-NEXT: v_writelane_b32 v33, s87, 29
; SI-NEXT: v_readfirstlane_b32 s64, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
-; SI-NEXT: v_writelane_b32 v33, s87, 31
+; SI-NEXT: v_writelane_b32 v33, s96, 30
; SI-NEXT: v_readfirstlane_b32 s65, v19
; SI-NEXT: v_mov_b32_e32 v19, s26
-; SI-NEXT: v_writelane_b32 v33, s96, 32
+; SI-NEXT: v_writelane_b32 v33, s97, 31
; SI-NEXT: v_readfirstlane_b32 s66, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
-; SI-NEXT: v_writelane_b32 v33, s97, 33
+; SI-NEXT: v_writelane_b32 v33, s98, 32
; SI-NEXT: v_readfirstlane_b32 s67, v19
; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_writelane_b32 v33, s98, 34
+; SI-NEXT: v_writelane_b32 v33, s99, 33
; SI-NEXT: v_readfirstlane_b32 s68, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT: v_writelane_b32 v33, s99, 35
+; SI-NEXT: v_writelane_b32 v33, s30, 34
; SI-NEXT: v_readfirstlane_b32 s69, v19
; SI-NEXT: v_readfirstlane_b32 s70, v0
; SI-NEXT: v_readfirstlane_b32 s71, v1
@@ -88373,6 +88372,7 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: v_readfirstlane_b32 s8, v16
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s9, v17
+; SI-NEXT: v_writelane_b32 v33, s31, 35
; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB61_4
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -88680,43 +88680,43 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: v_readlane_b32 s4, v34, 1
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31
; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT: v_readlane_b32 s30, v33, 34
; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT: v_readlane_b32 s99, v33, 35
-; SI-NEXT: v_readlane_b32 s98, v33, 34
-; SI-NEXT: v_readlane_b32 s97, v33, 33
-; SI-NEXT: v_readlane_b32 s96, v33, 32
-; SI-NEXT: v_readlane_b32 s87, v33, 31
-; SI-NEXT: v_readlane_b32 s86, v33, 30
-; SI-NEXT: v_readlane_b32 s85, v33, 29
-; SI-NEXT: v_readlane_b32 s84, v33, 28
-; SI-NEXT: v_readlane_b32 s83, v33, 27
-; SI-NEXT: v_readlane_b32 s82, v33, 26
-; SI-NEXT: v_readlane_b32 s81, v33, 25
-; SI-NEXT: v_readlane_b32 s80, v33, 24
-; SI-NEXT: v_readlane_b32 s71, v33, 23
-; SI-NEXT: v_readlane_b32 s70, v33, 22
-; SI-NEXT: v_readlane_b32 s69, v33, 21
-; SI-NEXT: v_readlane_b32 s68, v33, 20
-; SI-NEXT: v_readlane_b32 s67, v33, 19
-; SI-NEXT: v_readlane_b32 s66, v33, 18
-; SI-NEXT: v_readlane_b32 s65, v33, 17
-; SI-NEXT: v_readlane_b32 s64, v33, 16
-; SI-NEXT: v_readlane_b32 s55, v33, 15
-; SI-NEXT: v_readlane_b32 s54, v33, 14
-; SI-NEXT: v_readlane_b32 s53, v33, 13
-; SI-NEXT: v_readlane_b32 s52, v33, 12
-; SI-NEXT: v_readlane_b32 s51, v33, 11
-; SI-NEXT: v_readlane_b32 s50, v33, 10
-; SI-NEXT: v_readlane_b32 s49, v33, 9
-; SI-NEXT: v_readlane_b32 s48, v33, 8
-; SI-NEXT: v_readlane_b32 s39, v33, 7
-; SI-NEXT: v_readlane_b32 s38, v33, 6
-; SI-NEXT: v_readlane_b32 s37, v33, 5
-; SI-NEXT: v_readlane_b32 s36, v33, 4
-; SI-NEXT: v_readlane_b32 s35, v33, 3
-; SI-NEXT: v_readlane_b32 s34, v33, 2
-; SI-NEXT: v_readlane_b32 s31, v33, 1
-; SI-NEXT: v_readlane_b32 s30, v33, 0
+; SI-NEXT: v_readlane_b32 s31, v33, 35
+; SI-NEXT: v_readlane_b32 s99, v33, 33
+; SI-NEXT: v_readlane_b32 s98, v33, 32
+; SI-NEXT: v_readlane_b32 s97, v33, 31
+; SI-NEXT: v_readlane_b32 s96, v33, 30
+; SI-NEXT: v_readlane_b32 s87, v33, 29
+; SI-NEXT: v_readlane_b32 s86, v33, 28
+; SI-NEXT: v_readlane_b32 s85, v33, 27
+; SI-NEXT: v_readlane_b32 s84, v33, 26
+; SI-NEXT: v_readlane_b32 s83, v33, 25
+; SI-NEXT: v_readlane_b32 s82, v33, 24
+; SI-NEXT: v_readlane_b32 s81, v33, 23
+; SI-NEXT: v_readlane_b32 s80, v33, 22
+; SI-NEXT: v_readlane_b32 s71, v33, 21
+; SI-NEXT: v_readlane_b32 s70, v33, 20
+; SI-NEXT: v_readlane_b32 s69, v33, 19
+; SI-NEXT: v_readlane_b32 s68, v33, 18
+; SI-NEXT: v_readlane_b32 s67, v33, 17
+; SI-NEXT: v_readlane_b32 s66, v33, 16
+; SI-NEXT: v_readlane_b32 s65, v33, 15
+; SI-NEXT: v_readlane_b32 s64, v33, 14
+; SI-NEXT: v_readlane_b32 s55, v33, 13
+; SI-NEXT: v_readlane_b32 s54, v33, 12
+; SI-NEXT: v_readlane_b32 s53, v33, 11
+; SI-NEXT: v_readlane_b32 s52, v33, 10
+; SI-NEXT: v_readlane_b32 s51, v33, 9
+; SI-NEXT: v_readlane_b32 s50, v33, 8
+; SI-NEXT: v_readlane_b32 s49, v33, 7
+; SI-NEXT: v_readlane_b32 s48, v33, 6
+; SI-NEXT: v_readlane_b32 s39, v33, 5
+; SI-NEXT: v_readlane_b32 s38, v33, 4
+; SI-NEXT: v_readlane_b32 s37, v33, 3
+; SI-NEXT: v_readlane_b32 s36, v33, 2
+; SI-NEXT: v_readlane_b32 s35, v33, 1
+; SI-NEXT: v_readlane_b32 s34, v33, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -95987,55 +95987,55 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v32, s30, 0
-; SI-NEXT: v_writelane_b32 v32, s31, 1
-; SI-NEXT: v_writelane_b32 v32, s34, 2
-; SI-NEXT: v_writelane_b32 v32, s35, 3
-; SI-NEXT: v_writelane_b32 v32, s36, 4
-; SI-NEXT: v_writelane_b32 v32, s37, 5
-; SI-NEXT: v_writelane_b32 v32, s38, 6
+; SI-NEXT: v_writelane_b32 v32, s34, 0
+; SI-NEXT: v_writelane_b32 v32, s35, 1
+; SI-NEXT: v_writelane_b32 v32, s36, 2
+; SI-NEXT: v_writelane_b32 v32, s37, 3
+; SI-NEXT: v_writelane_b32 v32, s38, 4
+; SI-NEXT: v_writelane_b32 v32, s39, 5
+; SI-NEXT: v_writelane_b32 v32, s48, 6
; SI-NEXT: v_mov_b32_e32 v19, s16
-; SI-NEXT: v_writelane_b32 v32, s39, 7
+; SI-NEXT: v_writelane_b32 v32, s49, 7
; SI-NEXT: v_readfirstlane_b32 s56, v19
; SI-NEXT: v_mov_b32_e32 v19, s17
-; SI-NEXT: v_writelane_b32 v32, s48, 8
+; SI-NEXT: v_writelane_b32 v32, s50, 8
; SI-NEXT: v_readfirstlane_b32 s57, v19
; SI-NEXT: v_mov_b32_e32 v19, s18
-; SI-NEXT: v_writelane_b32 v32, s49, 9
+; SI-NEXT: v_writelane_b32 v32, s51, 9
; SI-NEXT: v_readfirstlane_b32 s46, v19
; SI-NEXT: v_mov_b32_e32 v19, s19
-; SI-NEXT: v_writelane_b32 v32, s50, 10
+; SI-NEXT: v_writelane_b32 v32, s52, 10
; SI-NEXT: v_readfirstlane_b32 s47, v19
; SI-NEXT: v_mov_b32_e32 v19, s20
-; SI-NEXT: v_writelane_b32 v32, s51, 11
+; SI-NEXT: v_writelane_b32 v32, s53, 11
; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
-; SI-NEXT: v_writelane_b32 v32, s52, 12
+; SI-NEXT: v_writelane_b32 v32, s54, 12
; SI-NEXT: v_readfirstlane_b32 s45, v19
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v32, s53, 13
+; SI-NEXT: v_writelane_b32 v32, s55, 13
; SI-NEXT: v_readfirstlane_b32 s42, v19
; SI-NEXT: v_mov_b32_e32 v19, s23
-; SI-NEXT: v_writelane_b32 v32, s54, 14
+; SI-NEXT: v_writelane_b32 v32, s64, 14
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
-; SI-NEXT: v_writelane_b32 v32, s55, 15
+; SI-NEXT: v_writelane_b32 v32, s65, 15
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
-; SI-NEXT: v_writelane_b32 v32, s64, 16
+; SI-NEXT: v_writelane_b32 v32, s66, 16
; SI-NEXT: v_readfirstlane_b32 s41, v19
; SI-NEXT: v_mov_b32_e32 v19, s26
-; SI-NEXT: v_writelane_b32 v32, s65, 17
+; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
-; SI-NEXT: v_writelane_b32 v32, s66, 18
+; SI-NEXT: v_writelane_b32 v32, s68, 18
; SI-NEXT: v_readfirstlane_b32 s25, v19
; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_writelane_b32 v32, s67, 19
+; SI-NEXT: v_writelane_b32 v32, s69, 19
; SI-NEXT: v_readfirstlane_b32 s22, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT: v_writelane_b32 v32, s68, 20
+; SI-NEXT: v_writelane_b32 v32, s30, 20
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v0
; SI-NEXT: v_readfirstlane_b32 s21, v1
@@ -96056,7 +96056,7 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v17
-; SI-NEXT: v_writelane_b32 v32, s69, 21
+; SI-NEXT: v_writelane_b32 v32, s31, 21
; SI-NEXT: s_cbranch_scc0 .LBB65_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 16
@@ -96254,6 +96254,7 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s56
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v32, 20
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s46
@@ -96286,28 +96287,27 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v29, s7
; SI-NEXT: v_mov_b32_e32 v30, s4
; SI-NEXT: v_mov_b32_e32 v31, s5
-; SI-NEXT: v_readlane_b32 s69, v32, 21
-; SI-NEXT: v_readlane_b32 s68, v32, 20
-; SI-NEXT: v_readlane_b32 s67, v32, 19
-; SI-NEXT: v_readlane_b32 s66, v32, 18
-; SI-NEXT: v_readlane_b32 s65, v32, 17
-; SI-NEXT: v_readlane_b32 s64, v32, 16
-; SI-NEXT: v_readlane_b32 s55, v32, 15
-; SI-NEXT: v_readlane_b32 s54, v32, 14
-; SI-NEXT: v_readlane_b32 s53, v32, 13
-; SI-NEXT: v_readlane_b32 s52, v32, 12
-; SI-NEXT: v_readlane_b32 s51, v32, 11
-; SI-NEXT: v_readlane_b32 s50, v32, 10
-; SI-NEXT: v_readlane_b32 s49, v32, 9
-; SI-NEXT: v_readlane_b32 s48, v32, 8
-; SI-NEXT: v_readlane_b32 s39, v32, 7
-; SI-NEXT: v_readlane_b32 s38, v32, 6
-; SI-NEXT: v_readlane_b32 s37, v32, 5
-; SI-NEXT: v_readlane_b32 s36, v32, 4
-; SI-NEXT: v_readlane_b32 s35, v32, 3
-; SI-NEXT: v_readlane_b32 s34, v32, 2
-; SI-NEXT: v_readlane_b32 s31, v32, 1
-; SI-NEXT: v_readlane_b32 s30, v32, 0
+; SI-NEXT: v_readlane_b32 s31, v32, 21
+; SI-NEXT: v_readlane_b32 s69, v32, 19
+; SI-NEXT: v_readlane_b32 s68, v32, 18
+; SI-NEXT: v_readlane_b32 s67, v32, 17
+; SI-NEXT: v_readlane_b32 s66, v32, 16
+; SI-NEXT: v_readlane_b32 s65, v32, 15
+; SI-NEXT: v_readlane_b32 s64, v32, 14
+; SI-NEXT: v_readlane_b32 s55, v32, 13
+; SI-NEXT: v_readlane_b32 s54, v32, 12
+; SI-NEXT: v_readlane_b32 s53, v32, 11
+; SI-NEXT: v_readlane_b32 s52, v32, 10
+; SI-NEXT: v_readlane_b32 s51, v32, 9
+; SI-NEXT: v_readlane_b32 s50, v32, 8
+; SI-NEXT: v_readlane_b32 s49, v32, 7
+; SI-NEXT: v_readlane_b32 s48, v32, 6
+; SI-NEXT: v_readlane_b32 s39, v32, 5
+; SI-NEXT: v_readlane_b32 s38, v32, 4
+; SI-NEXT: v_readlane_b32 s37, v32, 3
+; SI-NEXT: v_readlane_b32 s36, v32, 2
+; SI-NEXT: v_readlane_b32 s35, v32, 1
+; SI-NEXT: v_readlane_b32 s34, v32, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -98852,55 +98852,55 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v32, s30, 0
-; SI-NEXT: v_writelane_b32 v32, s31, 1
-; SI-NEXT: v_writelane_b32 v32, s34, 2
-; SI-NEXT: v_writelane_b32 v32, s35, 3
-; SI-NEXT: v_writelane_b32 v32, s36, 4
-; SI-NEXT: v_writelane_b32 v32, s37, 5
-; SI-NEXT: v_writelane_b32 v32, s38, 6
+; SI-NEXT: v_writelane_b32 v32, s34, 0
+; SI-NEXT: v_writelane_b32 v32, s35, 1
+; SI-NEXT: v_writelane_b32 v32, s36, 2
+; SI-NEXT: v_writelane_b32 v32, s37, 3
+; SI-NEXT: v_writelane_b32 v32, s38, 4
+; SI-NEXT: v_writelane_b32 v32, s39, 5
+; SI-NEXT: v_writelane_b32 v32, s48, 6
; SI-NEXT: v_mov_b32_e32 v19, s16
-; SI-NEXT: v_writelane_b32 v32, s39, 7
+; SI-NEXT: v_writelane_b32 v32, s49, 7
; SI-NEXT: v_readfirstlane_b32 s56, v19
; SI-NEXT: v_mov_b32_e32 v19, s17
-; SI-NEXT: v_writelane_b32 v32, s48, 8
+; SI-NEXT: v_writelane_b32 v32, s50, 8
; SI-NEXT: v_readfirstlane_b32 s57, v19
; SI-NEXT: v_mov_b32_e32 v19, s18
-; SI-NEXT: v_writelane_b32 v32, s49, 9
+; SI-NEXT: v_writelane_b32 v32, s51, 9
; SI-NEXT: v_readfirstlane_b32 s46, v19
; SI-NEXT: v_mov_b32_e32 v19, s19
-; SI-NEXT: v_writelane_b32 v32, s50, 10
+; SI-NEXT: v_writelane_b32 v32, s52, 10
; SI-NEXT: v_readfirstlane_b32 s47, v19
; SI-NEXT: v_mov_b32_e32 v19, s20
-; SI-NEXT: v_writelane_b32 v32, s51, 11
+; SI-NEXT: v_writelane_b32 v32, s53, 11
; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
-; SI-NEXT: v_writelane_b32 v32, s52, 12
+; SI-NEXT: v_writelane_b32 v32, s54, 12
; SI-NEXT: v_readfirstlane_b32 s45, v19
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v32, s53, 13
+; SI-NEXT: v_writelane_b32 v32, s55, 13
; SI-NEXT: v_readfirstlane_b32 s42, v19
; SI-NEXT: v_mov_b32_e32 v19, s23
-; SI-NEXT: v_writelane_b32 v32, s54, 14
+; SI-NEXT: v_writelane_b32 v32, s64, 14
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
-; SI-NEXT: v_writelane_b32 v32, s55, 15
+; SI-NEXT: v_writelane_b32 v32, s65, 15
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
-; SI-NEXT: v_writelane_b32 v32, s64, 16
+; SI-NEXT: v_writelane_b32 v32, s66, 16
; SI-NEXT: v_readfirstlane_b32 s41, v19
; SI-NEXT: v_mov_b32_e32 v19, s26
-; SI-NEXT: v_writelane_b32 v32, s65, 17
+; SI-NEXT: v_writelane_b32 v32, s67, 17
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
-; SI-NEXT: v_writelane_b32 v32, s66, 18
+; SI-NEXT: v_writelane_b32 v32, s68, 18
; SI-NEXT: v_readfirstlane_b32 s25, v19
; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_writelane_b32 v32, s67, 19
+; SI-NEXT: v_writelane_b32 v32, s69, 19
; SI-NEXT: v_readfirstlane_b32 s22, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT: v_writelane_b32 v32, s68, 20
+; SI-NEXT: v_writelane_b32 v32, s30, 20
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v0
; SI-NEXT: v_readfirstlane_b32 s21, v1
@@ -98921,7 +98921,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v16
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v17
-; SI-NEXT: v_writelane_b32 v32, s69, 21
+; SI-NEXT: v_writelane_b32 v32, s31, 21
; SI-NEXT: s_cbranch_scc0 .LBB69_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 16
@@ -99119,6 +99119,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s56
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v32, 20
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s46
@@ -99151,28 +99152,27 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v29, s7
; SI-NEXT: v_mov_b32_e32 v30, s4
; SI-NEXT: v_mov_b32_e32 v31, s5
-; SI-NEXT: v_readlane_b32 s69, v32, 21
-; SI-NEXT: v_readlane_b32 s68, v32, 20
-; SI-NEXT: v_readlane_b32 s67, v32, 19
-; SI-NEXT: v_readlane_b32 s66, v32, 18
-; SI-NEXT: v_readlane_b32 s65, v32, 17
-; SI-NEXT: v_readlane_b32 s64, v32, 16
-; SI-NEXT: v_readlane_b32 s55, v32, 15
-; SI-NEXT: v_readlane_b32 s54, v32, 14
-; SI-NEXT: v_readlane_b32 s53, v32, 13
-; SI-NEXT: v_readlane_b32 s52, v32, 12
-; SI-NEXT: v_readlane_b32 s51, v32, 11
-; SI-NEXT: v_readlane_b32 s50, v32, 10
-; SI-NEXT: v_readlane_b32 s49, v32, 9
-; SI-NEXT: v_readlane_b32 s48, v32, 8
-; SI-NEXT: v_readlane_b32 s39, v32, 7
-; SI-NEXT: v_readlane_b32 s38, v32, 6
-; SI-NEXT: v_readlane_b32 s37, v32, 5
-; SI-NEXT: v_readlane_b32 s36, v32, 4
-; SI-NEXT: v_readlane_b32 s35, v32, 3
-; SI-NEXT: v_readlane_b32 s34, v32, 2
-; SI-NEXT: v_readlane_b32 s31, v32, 1
-; SI-NEXT: v_readlane_b32 s30, v32, 0
+; SI-NEXT: v_readlane_b32 s31, v32, 21
+; SI-NEXT: v_readlane_b32 s69, v32, 19
+; SI-NEXT: v_readlane_b32 s68, v32, 18
+; SI-NEXT: v_readlane_b32 s67, v32, 17
+; SI-NEXT: v_readlane_b32 s66, v32, 16
+; SI-NEXT: v_readlane_b32 s65, v32, 15
+; SI-NEXT: v_readlane_b32 s64, v32, 14
+; SI-NEXT: v_readlane_b32 s55, v32, 13
+; SI-NEXT: v_readlane_b32 s54, v32, 12
+; SI-NEXT: v_readlane_b32 s53, v32, 11
+; SI-NEXT: v_readlane_b32 s52, v32, 10
+; SI-NEXT: v_readlane_b32 s51, v32, 9
+; SI-NEXT: v_readlane_b32 s50, v32, 8
+; SI-NEXT: v_readlane_b32 s49, v32, 7
+; SI-NEXT: v_readlane_b32 s48, v32, 6
+; SI-NEXT: v_readlane_b32 s39, v32, 5
+; SI-NEXT: v_readlane_b32 s38, v32, 4
+; SI-NEXT: v_readlane_b32 s37, v32, 3
+; SI-NEXT: v_readlane_b32 s36, v32, 2
+; SI-NEXT: v_readlane_b32 s35, v32, 1
+; SI-NEXT: v_readlane_b32 s34, v32, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -104931,70 +104931,70 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v63, s30, 0
-; SI-NEXT: v_writelane_b32 v63, s31, 1
-; SI-NEXT: v_writelane_b32 v63, s34, 2
-; SI-NEXT: v_writelane_b32 v63, s35, 3
-; SI-NEXT: v_writelane_b32 v63, s36, 4
-; SI-NEXT: v_writelane_b32 v63, s37, 5
-; SI-NEXT: v_writelane_b32 v63, s38, 6
-; SI-NEXT: v_writelane_b32 v63, s39, 7
-; SI-NEXT: v_writelane_b32 v63, s48, 8
-; SI-NEXT: v_writelane_b32 v63, s49, 9
-; SI-NEXT: v_writelane_b32 v63, s50, 10
-; SI-NEXT: v_writelane_b32 v63, s51, 11
-; SI-NEXT: v_writelane_b32 v63, s52, 12
-; SI-NEXT: v_writelane_b32 v63, s53, 13
-; SI-NEXT: v_writelane_b32 v63, s54, 14
-; SI-NEXT: v_writelane_b32 v63, s55, 15
-; SI-NEXT: v_writelane_b32 v63, s64, 16
-; SI-NEXT: v_writelane_b32 v63, s65, 17
-; SI-NEXT: v_writelane_b32 v63, s66, 18
-; SI-NEXT: v_writelane_b32 v63, s67, 19
-; SI-NEXT: v_writelane_b32 v63, s68, 20
-; SI-NEXT: v_writelane_b32 v63, s69, 21
+; SI-NEXT: v_writelane_b32 v63, s34, 0
+; SI-NEXT: v_writelane_b32 v63, s35, 1
+; SI-NEXT: v_writelane_b32 v63, s36, 2
+; SI-NEXT: v_writelane_b32 v63, s37, 3
+; SI-NEXT: v_writelane_b32 v63, s38, 4
+; SI-NEXT: v_writelane_b32 v63, s39, 5
+; SI-NEXT: v_writelane_b32 v63, s48, 6
+; SI-NEXT: v_writelane_b32 v63, s49, 7
+; SI-NEXT: v_writelane_b32 v63, s50, 8
+; SI-NEXT: v_writelane_b32 v63, s51, 9
+; SI-NEXT: v_writelane_b32 v63, s52, 10
+; SI-NEXT: v_writelane_b32 v63, s53, 11
+; SI-NEXT: v_writelane_b32 v63, s54, 12
+; SI-NEXT: v_writelane_b32 v63, s55, 13
+; SI-NEXT: v_writelane_b32 v63, s64, 14
+; SI-NEXT: v_writelane_b32 v63, s65, 15
+; SI-NEXT: v_writelane_b32 v63, s66, 16
+; SI-NEXT: v_writelane_b32 v63, s67, 17
+; SI-NEXT: v_writelane_b32 v63, s68, 18
+; SI-NEXT: v_writelane_b32 v63, s69, 19
+; SI-NEXT: v_writelane_b32 v63, s70, 20
+; SI-NEXT: v_writelane_b32 v63, s71, 21
; SI-NEXT: v_mov_b32_e32 v20, s16
-; SI-NEXT: v_writelane_b32 v63, s70, 22
+; SI-NEXT: v_writelane_b32 v63, s80, 22
; SI-NEXT: v_readfirstlane_b32 s56, v20
; SI-NEXT: v_mov_b32_e32 v20, s17
-; SI-NEXT: v_writelane_b32 v63, s71, 23
+; SI-NEXT: v_writelane_b32 v63, s81, 23
; SI-NEXT: v_readfirstlane_b32 s57, v20
; SI-NEXT: v_mov_b32_e32 v20, s18
-; SI-NEXT: v_writelane_b32 v63, s80, 24
+; SI-NEXT: v_writelane_b32 v63, s82, 24
; SI-NEXT: v_readfirstlane_b32 s46, v20
; SI-NEXT: v_mov_b32_e32 v20, s19
-; SI-NEXT: v_writelane_b32 v63, s81, 25
+; SI-NEXT: v_writelane_b32 v63, s83, 25
; SI-NEXT: v_readfirstlane_b32 s47, v20
; SI-NEXT: v_mov_b32_e32 v20, s20
-; SI-NEXT: v_writelane_b32 v63, s82, 26
+; SI-NEXT: v_writelane_b32 v63, s84, 26
; SI-NEXT: v_readfirstlane_b32 s44, v20
; SI-NEXT: v_mov_b32_e32 v20, s21
-; SI-NEXT: v_writelane_b32 v63, s83, 27
+; SI-NEXT: v_writelane_b32 v63, s85, 27
; SI-NEXT: v_readfirstlane_b32 s45, v20
; SI-NEXT: v_mov_b32_e32 v20, s22
-; SI-NEXT: v_writelane_b32 v63, s84, 28
+; SI-NEXT: v_writelane_b32 v63, s86, 28
; SI-NEXT: v_readfirstlane_b32 s42, v20
; SI-NEXT: v_mov_b32_e32 v20, s23
-; SI-NEXT: v_writelane_b32 v63, s85, 29
+; SI-NEXT: v_writelane_b32 v63, s87, 29
; SI-NEXT: v_readfirstlane_b32 s43, v20
; SI-NEXT: v_mov_b32_e32 v20, s24
-; SI-NEXT: v_writelane_b32 v63, s86, 30
+; SI-NEXT: v_writelane_b32 v63, s96, 30
; SI-NEXT: v_readfirstlane_b32 s40, v20
; SI-NEXT: v_mov_b32_e32 v20, s25
-; SI-NEXT: v_writelane_b32 v63, s87, 31
+; SI-NEXT: v_writelane_b32 v63, s97, 31
; SI-NEXT: v_readfirstlane_b32 s41, v20
; SI-NEXT: v_mov_b32_e32 v20, s26
-; SI-NEXT: v_writelane_b32 v63, s96, 32
+; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_readfirstlane_b32 s24, v20
; SI-NEXT: v_mov_b32_e32 v20, s27
-; SI-NEXT: v_writelane_b32 v63, s97, 33
+; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_readfirstlane_b32 s25, v20
; SI-NEXT: v_mov_b32_e32 v20, s28
-; SI-NEXT: v_writelane_b32 v63, s98, 34
+; SI-NEXT: v_writelane_b32 v63, s30, 34
; SI-NEXT: v_readfirstlane_b32 s22, v20
; SI-NEXT: v_mov_b32_e32 v20, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: v_writelane_b32 v63, s99, 35
+; SI-NEXT: v_writelane_b32 v63, s31, 35
; SI-NEXT: v_readfirstlane_b32 s23, v20
; SI-NEXT: v_readfirstlane_b32 s20, v1
; SI-NEXT: v_readfirstlane_b32 s21, v2
@@ -105860,38 +105860,38 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: s_lshl_b32 s6, s37, 8
; SI-NEXT: s_lshl_b32 s8, s35, 24
-; SI-NEXT: v_readlane_b32 s99, v63, 35
-; SI-NEXT: v_readlane_b32 s98, v63, 34
-; SI-NEXT: v_readlane_b32 s97, v63, 33
-; SI-NEXT: v_readlane_b32 s96, v63, 32
-; SI-NEXT: v_readlane_b32 s87, v63, 31
-; SI-NEXT: v_readlane_b32 s86, v63, 30
-; SI-NEXT: v_readlane_b32 s85, v63, 29
-; SI-NEXT: v_readlane_b32 s84, v63, 28
-; SI-NEXT: v_readlane_b32 s83, v63, 27
-; SI-NEXT: v_readlane_b32 s82, v63, 26
-; SI-NEXT: v_readlane_b32 s81, v63, 25
-; SI-NEXT: v_readlane_b32 s80, v63, 24
-; SI-NEXT: v_readlane_b32 s71, v63, 23
-; SI-NEXT: v_readlane_b32 s70, v63, 22
-; SI-NEXT: v_readlane_b32 s69, v63, 21
-; SI-NEXT: v_readlane_b32 s68, v63, 20
-; SI-NEXT: v_readlane_b32 s67, v63, 19
-; SI-NEXT: v_readlane_b32 s66, v63, 18
-; SI-NEXT: v_readlane_b32 s65, v63, 17
-; SI-NEXT: v_readlane_b32 s64, v63, 16
-; SI-NEXT: v_readlane_b32 s55, v63, 15
-; SI-NEXT: v_readlane_b32 s54, v63, 14
-; SI-NEXT: v_readlane_b32 s53, v63, 13
-; SI-NEXT: v_readlane_b32 s52, v63, 12
-; SI-NEXT: v_readlane_b32 s51, v63, 11
-; SI-NEXT: v_readlane_b32 s50, v63, 10
-; SI-NEXT: v_readlane_b32 s49, v63, 9
-; SI-NEXT: v_readlane_b32 s48, v63, 8
-; SI-NEXT: v_readlane_b32 s39, v63, 7
-; SI-NEXT: v_readlane_b32 s38, v63, 6
-; SI-NEXT: v_readlane_b32 s37, v63, 5
-; SI-NEXT: v_readlane_b32 s35, v63, 3
+; SI-NEXT: v_readlane_b32 s99, v63, 33
+; SI-NEXT: v_readlane_b32 s98, v63, 32
+; SI-NEXT: v_readlane_b32 s97, v63, 31
+; SI-NEXT: v_readlane_b32 s96, v63, 30
+; SI-NEXT: v_readlane_b32 s87, v63, 29
+; SI-NEXT: v_readlane_b32 s86, v63, 28
+; SI-NEXT: v_readlane_b32 s85, v63, 27
+; SI-NEXT: v_readlane_b32 s84, v63, 26
+; SI-NEXT: v_readlane_b32 s83, v63, 25
+; SI-NEXT: v_readlane_b32 s82, v63, 24
+; SI-NEXT: v_readlane_b32 s81, v63, 23
+; SI-NEXT: v_readlane_b32 s80, v63, 22
+; SI-NEXT: v_readlane_b32 s71, v63, 21
+; SI-NEXT: v_readlane_b32 s70, v63, 20
+; SI-NEXT: v_readlane_b32 s69, v63, 19
+; SI-NEXT: v_readlane_b32 s68, v63, 18
+; SI-NEXT: v_readlane_b32 s67, v63, 17
+; SI-NEXT: v_readlane_b32 s66, v63, 16
+; SI-NEXT: v_readlane_b32 s65, v63, 15
+; SI-NEXT: v_readlane_b32 s64, v63, 14
+; SI-NEXT: v_readlane_b32 s55, v63, 13
+; SI-NEXT: v_readlane_b32 s54, v63, 12
+; SI-NEXT: v_readlane_b32 s53, v63, 11
+; SI-NEXT: v_readlane_b32 s52, v63, 10
+; SI-NEXT: v_readlane_b32 s51, v63, 9
+; SI-NEXT: v_readlane_b32 s50, v63, 8
+; SI-NEXT: v_readlane_b32 s49, v63, 7
+; SI-NEXT: v_readlane_b32 s48, v63, 6
+; SI-NEXT: v_readlane_b32 s39, v63, 5
+; SI-NEXT: v_readlane_b32 s38, v63, 4
+; SI-NEXT: v_readlane_b32 s37, v63, 3
+; SI-NEXT: v_readlane_b32 s35, v63, 1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v10
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -105924,9 +105924,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: s_lshl_b32 s6, s34, 8
; SI-NEXT: s_lshl_b32 s8, s30, 24
-; SI-NEXT: v_readlane_b32 s36, v63, 4
-; SI-NEXT: v_readlane_b32 s34, v63, 2
-; SI-NEXT: v_readlane_b32 s30, v63, 0
+; SI-NEXT: v_readlane_b32 s36, v63, 2
+; SI-NEXT: v_readlane_b32 s34, v63, 0
; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v18
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -105960,7 +105959,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: s_lshl_b32 s6, s95, 8
; SI-NEXT: s_lshl_b32 s8, s93, 24
-; SI-NEXT: v_readlane_b32 s31, v63, 1
+; SI-NEXT: v_readlane_b32 s30, v63, 34
+; SI-NEXT: v_readlane_b32 s31, v63, 35
; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v55
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
@@ -108181,72 +108181,72 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v74, s30, 0
-; GFX11-NEXT: v_writelane_b32 v75, s96, 0
+; GFX11-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-NEXT: v_writelane_b32 v75, s98, 0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: v_writelane_b32 v74, s31, 1
-; GFX11-NEXT: v_writelane_b32 v75, s97, 1
+; GFX11-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-NEXT: v_writelane_b32 v75, s99, 1
; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19
-; GFX11-NEXT: v_writelane_b32 v74, s34, 2
-; GFX11-NEXT: v_writelane_b32 v75, s98, 2
+; GFX11-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-NEXT: v_writelane_b32 v75, s100, 2
; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21
; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23
-; GFX11-NEXT: v_writelane_b32 v74, s35, 3
-; GFX11-NEXT: v_writelane_b32 v75, s99, 3
+; GFX11-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-NEXT: v_writelane_b32 v75, s101, 3
; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25
; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27
-; GFX11-NEXT: v_writelane_b32 v74, s36, 4
-; GFX11-NEXT: v_writelane_b32 v75, s100, 4
+; GFX11-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-NEXT: v_writelane_b32 v75, s102, 4
; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT: v_writelane_b32 v74, s37, 5
-; GFX11-NEXT: v_writelane_b32 v75, s101, 5
+; GFX11-NEXT: v_writelane_b32 v74, s39, 5
+; GFX11-NEXT: v_writelane_b32 v75, s103, 5
; GFX11-NEXT: v_readfirstlane_b32 s0, v16
; GFX11-NEXT: v_readfirstlane_b32 s1, v17
; GFX11-NEXT: v_readfirstlane_b32 s2, v18
-; GFX11-NEXT: v_writelane_b32 v74, s38, 6
-; GFX11-NEXT: v_writelane_b32 v75, s102, 6
+; GFX11-NEXT: v_writelane_b32 v74, s48, 6
+; GFX11-NEXT: v_writelane_b32 v75, s104, 6
; GFX11-NEXT: v_readfirstlane_b32 s3, v19
; GFX11-NEXT: v_readfirstlane_b32 s4, v20
; GFX11-NEXT: v_readfirstlane_b32 s5, v21
-; GFX11-NEXT: v_writelane_b32 v74, s39, 7
-; GFX11-NEXT: v_writelane_b32 v75, s103, 7
+; GFX11-NEXT: v_writelane_b32 v74, s49, 7
+; GFX11-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-NEXT: v_readfirstlane_b32 s6, v22
; GFX11-NEXT: v_readfirstlane_b32 s7, v23
; GFX11-NEXT: v_readfirstlane_b32 s8, v24
-; GFX11-NEXT: v_writelane_b32 v74, s48, 8
+; GFX11-NEXT: v_writelane_b32 v74, s50, 8
; GFX11-NEXT: v_readfirstlane_b32 s9, v25
; GFX11-NEXT: v_readfirstlane_b32 s10, v26
; GFX11-NEXT: v_readfirstlane_b32 s11, v27
; GFX11-NEXT: v_readfirstlane_b32 s12, v28
-; GFX11-NEXT: v_writelane_b32 v74, s49, 9
+; GFX11-NEXT: v_writelane_b32 v74, s51, 9
; GFX11-NEXT: v_readfirstlane_b32 s13, v29
; GFX11-NEXT: v_readfirstlane_b32 s14, v30
; GFX11-NEXT: v_readfirstlane_b32 s15, v31
; GFX11-NEXT: v_readfirstlane_b32 s16, v32
-; GFX11-NEXT: v_writelane_b32 v74, s50, 10
+; GFX11-NEXT: v_writelane_b32 v74, s52, 10
; GFX11-NEXT: v_readfirstlane_b32 s17, v33
; GFX11-NEXT: v_readfirstlane_b32 s18, v1
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
; GFX11-NEXT: v_readfirstlane_b32 s20, v3
-; GFX11-NEXT: v_writelane_b32 v74, s51, 11
+; GFX11-NEXT: v_writelane_b32 v74, s53, 11
; GFX11-NEXT: v_readfirstlane_b32 s21, v4
; GFX11-NEXT: v_readfirstlane_b32 s22, v5
; GFX11-NEXT: v_readfirstlane_b32 s23, v6
; GFX11-NEXT: v_readfirstlane_b32 s24, v7
-; GFX11-NEXT: v_writelane_b32 v74, s52, 12
+; GFX11-NEXT: v_writelane_b32 v74, s54, 12
; GFX11-NEXT: v_readfirstlane_b32 s25, v8
; GFX11-NEXT: v_readfirstlane_b32 s26, v9
; GFX11-NEXT: v_readfirstlane_b32 s27, v10
; GFX11-NEXT: v_readfirstlane_b32 s28, v11
-; GFX11-NEXT: v_writelane_b32 v74, s53, 13
+; GFX11-NEXT: v_writelane_b32 v74, s55, 13
; GFX11-NEXT: v_readfirstlane_b32 s29, v12
; GFX11-NEXT: v_readfirstlane_b32 s40, v13
; GFX11-NEXT: v_readfirstlane_b32 s41, v14
; GFX11-NEXT: s_mov_b32 vcc_hi, 0
-; GFX11-NEXT: v_writelane_b32 v74, s54, 14
+; GFX11-NEXT: v_writelane_b32 v74, s64, 14
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
; GFX11-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68
@@ -108267,26 +108267,26 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:8
; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:4
; GFX11-NEXT: scratch_store_b32 off, v73, s32
-; GFX11-NEXT: v_writelane_b32 v75, s104, 8
+; GFX11-NEXT: v_writelane_b32 v75, s31, 8
; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
-; GFX11-NEXT: v_writelane_b32 v74, s55, 15
-; GFX11-NEXT: v_writelane_b32 v74, s64, 16
-; GFX11-NEXT: v_writelane_b32 v74, s65, 17
-; GFX11-NEXT: v_writelane_b32 v74, s66, 18
-; GFX11-NEXT: v_writelane_b32 v74, s67, 19
-; GFX11-NEXT: v_writelane_b32 v74, s68, 20
-; GFX11-NEXT: v_writelane_b32 v74, s69, 21
-; GFX11-NEXT: v_writelane_b32 v74, s70, 22
-; GFX11-NEXT: v_writelane_b32 v74, s71, 23
-; GFX11-NEXT: v_writelane_b32 v74, s80, 24
-; GFX11-NEXT: v_writelane_b32 v74, s81, 25
-; GFX11-NEXT: v_writelane_b32 v74, s82, 26
-; GFX11-NEXT: v_writelane_b32 v74, s83, 27
-; GFX11-NEXT: v_writelane_b32 v74, s84, 28
-; GFX11-NEXT: v_writelane_b32 v74, s85, 29
-; GFX11-NEXT: v_writelane_b32 v74, s86, 30
-; GFX11-NEXT: v_writelane_b32 v74, s87, 31
+; GFX11-NEXT: v_writelane_b32 v74, s65, 15
+; GFX11-NEXT: v_writelane_b32 v74, s66, 16
+; GFX11-NEXT: v_writelane_b32 v74, s67, 17
+; GFX11-NEXT: v_writelane_b32 v74, s68, 18
+; GFX11-NEXT: v_writelane_b32 v74, s69, 19
+; GFX11-NEXT: v_writelane_b32 v74, s70, 20
+; GFX11-NEXT: v_writelane_b32 v74, s71, 21
+; GFX11-NEXT: v_writelane_b32 v74, s80, 22
+; GFX11-NEXT: v_writelane_b32 v74, s81, 23
+; GFX11-NEXT: v_writelane_b32 v74, s82, 24
+; GFX11-NEXT: v_writelane_b32 v74, s83, 25
+; GFX11-NEXT: v_writelane_b32 v74, s84, 26
+; GFX11-NEXT: v_writelane_b32 v74, s85, 27
+; GFX11-NEXT: v_writelane_b32 v74, s86, 28
+; GFX11-NEXT: v_writelane_b32 v74, s87, 29
+; GFX11-NEXT: v_writelane_b32 v74, s96, 30
+; GFX11-NEXT: v_writelane_b32 v74, s97, 31
; GFX11-NEXT: s_cbranch_scc0 .LBB73_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 16
@@ -108998,47 +108998,47 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT: v_readlane_b32 s104, v75, 8
-; GFX11-NEXT: v_readlane_b32 s103, v75, 7
-; GFX11-NEXT: v_readlane_b32 s102, v75, 6
-; GFX11-NEXT: v_readlane_b32 s101, v75, 5
-; GFX11-NEXT: v_readlane_b32 s100, v75, 4
-; GFX11-NEXT: v_readlane_b32 s99, v75, 3
-; GFX11-NEXT: v_readlane_b32 s98, v75, 2
-; GFX11-NEXT: v_readlane_b32 s97, v75, 1
-; GFX11-NEXT: v_readlane_b32 s96, v75, 0
-; GFX11-NEXT: v_readlane_b32 s87, v74, 31
-; GFX11-NEXT: v_readlane_b32 s86, v74, 30
-; GFX11-NEXT: v_readlane_b32 s85, v74, 29
-; GFX11-NEXT: v_readlane_b32 s84, v74, 28
-; GFX11-NEXT: v_readlane_b32 s83, v74, 27
-; GFX11-NEXT: v_readlane_b32 s82, v74, 26
-; GFX11-NEXT: v_readlane_b32 s81, v74, 25
-; GFX11-NEXT: v_readlane_b32 s80, v74, 24
-; GFX11-NEXT: v_readlane_b32 s71, v74, 23
-; GFX11-NEXT: v_readlane_b32 s70, v74, 22
-; GFX11-NEXT: v_readlane_b32 s69, v74, 21
-; GFX11-NEXT: v_readlane_b32 s68, v74, 20
-; GFX11-NEXT: v_readlane_b32 s67, v74, 19
-; GFX11-NEXT: v_readlane_b32 s66, v74, 18
-; GFX11-NEXT: v_readlane_b32 s65, v74, 17
-; GFX11-NEXT: v_readlane_b32 s64, v74, 16
-; GFX11-NEXT: v_readlane_b32 s55, v74, 15
-; GFX11-NEXT: v_readlane_b32 s54, v74, 14
-; GFX11-NEXT: v_readlane_b32 s53, v74, 13
-; GFX11-NEXT: v_readlane_b32 s52, v74, 12
-; GFX11-NEXT: v_readlane_b32 s51, v74, 11
-; GFX11-NEXT: v_readlane_b32 s50, v74, 10
-; GFX11-NEXT: v_readlane_b32 s49, v74, 9
-; GFX11-NEXT: v_readlane_b32 s48, v74, 8
-; GFX11-NEXT: v_readlane_b32 s39, v74, 7
-; GFX11-NEXT: v_readlane_b32 s38, v74, 6
-; GFX11-NEXT: v_readlane_b32 s37, v74, 5
-; GFX11-NEXT: v_readlane_b32 s36, v74, 4
-; GFX11-NEXT: v_readlane_b32 s35, v74, 3
-; GFX11-NEXT: v_readlane_b32 s34, v74, 2
-; GFX11-NEXT: v_readlane_b32 s31, v74, 1
-; GFX11-NEXT: v_readlane_b32 s30, v74, 0
+; GFX11-NEXT: v_readlane_b32 s30, v75, 7
+; GFX11-NEXT: v_readlane_b32 s31, v75, 8
+; GFX11-NEXT: v_readlane_b32 s104, v75, 6
+; GFX11-NEXT: v_readlane_b32 s103, v75, 5
+; GFX11-NEXT: v_readlane_b32 s102, v75, 4
+; GFX11-NEXT: v_readlane_b32 s101, v75, 3
+; GFX11-NEXT: v_readlane_b32 s100, v75, 2
+; GFX11-NEXT: v_readlane_b32 s99, v75, 1
+; GFX11-NEXT: v_readlane_b32 s98, v75, 0
+; GFX11-NEXT: v_readlane_b32 s97, v74, 31
+; GFX11-NEXT: v_readlane_b32 s96, v74, 30
+; GFX11-NEXT: v_readlane_b32 s87, v74, 29
+; GFX11-NEXT: v_readlane_b32 s86, v74, 28
+; GFX11-NEXT: v_readlane_b32 s85, v74, 27
+; GFX11-NEXT: v_readlane_b32 s84, v74, 26
+; GFX11-NEXT: v_readlane_b32 s83, v74, 25
+; GFX11-NEXT: v_readlane_b32 s82, v74, 24
+; GFX11-NEXT: v_readlane_b32 s81, v74, 23
+; GFX11-NEXT: v_readlane_b32 s80, v74, 22
+; GFX11-NEXT: v_readlane_b32 s71, v74, 21
+; GFX11-NEXT: v_readlane_b32 s70, v74, 20
+; GFX11-NEXT: v_readlane_b32 s69, v74, 19
+; GFX11-NEXT: v_readlane_b32 s68, v74, 18
+; GFX11-NEXT: v_readlane_b32 s67, v74, 17
+; GFX11-NEXT: v_readlane_b32 s66, v74, 16
+; GFX11-NEXT: v_readlane_b32 s65, v74, 15
+; GFX11-NEXT: v_readlane_b32 s64, v74, 14
+; GFX11-NEXT: v_readlane_b32 s55, v74, 13
+; GFX11-NEXT: v_readlane_b32 s54, v74, 12
+; GFX11-NEXT: v_readlane_b32 s53, v74, 11
+; GFX11-NEXT: v_readlane_b32 s52, v74, 10
+; GFX11-NEXT: v_readlane_b32 s51, v74, 9
+; GFX11-NEXT: v_readlane_b32 s50, v74, 8
+; GFX11-NEXT: v_readlane_b32 s49, v74, 7
+; GFX11-NEXT: v_readlane_b32 s48, v74, 6
+; GFX11-NEXT: v_readlane_b32 s39, v74, 5
+; GFX11-NEXT: v_readlane_b32 s38, v74, 4
+; GFX11-NEXT: v_readlane_b32 s37, v74, 3
+; GFX11-NEXT: v_readlane_b32 s36, v74, 2
+; GFX11-NEXT: v_readlane_b32 s35, v74, 1
+; GFX11-NEXT: v_readlane_b32 s34, v74, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72
@@ -120019,70 +120019,70 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v63, s30, 0
-; SI-NEXT: v_writelane_b32 v63, s31, 1
-; SI-NEXT: v_writelane_b32 v63, s34, 2
-; SI-NEXT: v_writelane_b32 v63, s35, 3
-; SI-NEXT: v_writelane_b32 v63, s36, 4
-; SI-NEXT: v_writelane_b32 v63, s37, 5
-; SI-NEXT: v_writelane_b32 v63, s38, 6
-; SI-NEXT: v_writelane_b32 v63, s39, 7
-; SI-NEXT: v_writelane_b32 v63, s48, 8
-; SI-NEXT: v_writelane_b32 v63, s49, 9
-; SI-NEXT: v_writelane_b32 v63, s50, 10
-; SI-NEXT: v_writelane_b32 v63, s51, 11
-; SI-NEXT: v_writelane_b32 v63, s52, 12
-; SI-NEXT: v_writelane_b32 v63, s53, 13
-; SI-NEXT: v_writelane_b32 v63, s54, 14
-; SI-NEXT: v_writelane_b32 v63, s55, 15
-; SI-NEXT: v_writelane_b32 v63, s64, 16
-; SI-NEXT: v_writelane_b32 v63, s65, 17
-; SI-NEXT: v_writelane_b32 v63, s66, 18
-; SI-NEXT: v_writelane_b32 v63, s67, 19
-; SI-NEXT: v_writelane_b32 v63, s68, 20
-; SI-NEXT: v_writelane_b32 v63, s69, 21
+; SI-NEXT: v_writelane_b32 v63, s34, 0
+; SI-NEXT: v_writelane_b32 v63, s35, 1
+; SI-NEXT: v_writelane_b32 v63, s36, 2
+; SI-NEXT: v_writelane_b32 v63, s37, 3
+; SI-NEXT: v_writelane_b32 v63, s38, 4
+; SI-NEXT: v_writelane_b32 v63, s39, 5
+; SI-NEXT: v_writelane_b32 v63, s48, 6
+; SI-NEXT: v_writelane_b32 v63, s49, 7
+; SI-NEXT: v_writelane_b32 v63, s50, 8
+; SI-NEXT: v_writelane_b32 v63, s51, 9
+; SI-NEXT: v_writelane_b32 v63, s52, 10
+; SI-NEXT: v_writelane_b32 v63, s53, 11
+; SI-NEXT: v_writelane_b32 v63, s54, 12
+; SI-NEXT: v_writelane_b32 v63, s55, 13
+; SI-NEXT: v_writelane_b32 v63, s64, 14
+; SI-NEXT: v_writelane_b32 v63, s65, 15
+; SI-NEXT: v_writelane_b32 v63, s66, 16
+; SI-NEXT: v_writelane_b32 v63, s67, 17
+; SI-NEXT: v_writelane_b32 v63, s68, 18
+; SI-NEXT: v_writelane_b32 v63, s69, 19
+; SI-NEXT: v_writelane_b32 v63, s70, 20
+; SI-NEXT: v_writelane_b32 v63, s71, 21
; SI-NEXT: v_mov_b32_e32 v19, s16
-; SI-NEXT: v_writelane_b32 v63, s70, 22
+; SI-NEXT: v_writelane_b32 v63, s80, 22
; SI-NEXT: v_readfirstlane_b32 s4, v19
; SI-NEXT: v_mov_b32_e32 v19, s17
-; SI-NEXT: v_writelane_b32 v63, s71, 23
+; SI-NEXT: v_writelane_b32 v63, s81, 23
; SI-NEXT: v_readfirstlane_b32 s5, v19
; SI-NEXT: v_mov_b32_e32 v19, s18
-; SI-NEXT: v_writelane_b32 v63, s80, 24
+; SI-NEXT: v_writelane_b32 v63, s82, 24
; SI-NEXT: v_readfirstlane_b32 s6, v19
; SI-NEXT: v_mov_b32_e32 v19, s19
-; SI-NEXT: v_writelane_b32 v63, s81, 25
+; SI-NEXT: v_writelane_b32 v63, s83, 25
; SI-NEXT: v_readfirstlane_b32 s7, v19
; SI-NEXT: v_mov_b32_e32 v19, s20
-; SI-NEXT: v_writelane_b32 v63, s82, 26
+; SI-NEXT: v_writelane_b32 v63, s84, 26
; SI-NEXT: v_readfirstlane_b32 s8, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
-; SI-NEXT: v_writelane_b32 v63, s83, 27
+; SI-NEXT: v_writelane_b32 v63, s85, 27
; SI-NEXT: v_readfirstlane_b32 s9, v19
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v63, s84, 28
+; SI-NEXT: v_writelane_b32 v63, s86, 28
; SI-NEXT: v_readfirstlane_b32 s20, v19
; SI-NEXT: v_mov_b32_e32 v19, s23
-; SI-NEXT: v_writelane_b32 v63, s85, 29
+; SI-NEXT: v_writelane_b32 v63, s87, 29
; SI-NEXT: v_readfirstlane_b32 s21, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
-; SI-NEXT: v_writelane_b32 v63, s86, 30
+; SI-NEXT: v_writelane_b32 v63, s96, 30
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
-; SI-NEXT: v_writelane_b32 v63, s87, 31
+; SI-NEXT: v_writelane_b32 v63, s97, 31
; SI-NEXT: v_readfirstlane_b32 s25, v19
; SI-NEXT: v_mov_b32_e32 v19, s26
-; SI-NEXT: v_writelane_b32 v63, s96, 32
+; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
-; SI-NEXT: v_writelane_b32 v63, s97, 33
+; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_readfirstlane_b32 s41, v19
; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_writelane_b32 v63, s98, 34
+; SI-NEXT: v_writelane_b32 v63, s30, 34
; SI-NEXT: v_readfirstlane_b32 s42, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT: v_writelane_b32 v63, s99, 35
+; SI-NEXT: v_writelane_b32 v63, s31, 35
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_readfirstlane_b32 s44, v0
; SI-NEXT: v_readfirstlane_b32 s45, v1
@@ -120545,42 +120545,42 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54
; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38
-; SI-NEXT: v_readlane_b32 s99, v63, 35
-; SI-NEXT: v_readlane_b32 s98, v63, 34
-; SI-NEXT: v_readlane_b32 s97, v63, 33
-; SI-NEXT: v_readlane_b32 s96, v63, 32
-; SI-NEXT: v_readlane_b32 s87, v63, 31
-; SI-NEXT: v_readlane_b32 s86, v63, 30
-; SI-NEXT: v_readlane_b32 s85, v63, 29
-; SI-NEXT: v_readlane_b32 s84, v63, 28
-; SI-NEXT: v_readlane_b32 s83, v63, 27
-; SI-NEXT: v_readlane_b32 s82, v63, 26
-; SI-NEXT: v_readlane_b32 s81, v63, 25
-; SI-NEXT: v_readlane_b32 s80, v63, 24
-; SI-NEXT: v_readlane_b32 s71, v63, 23
-; SI-NEXT: v_readlane_b32 s70, v63, 22
-; SI-NEXT: v_readlane_b32 s69, v63, 21
-; SI-NEXT: v_readlane_b32 s68, v63, 20
-; SI-NEXT: v_readlane_b32 s67, v63, 19
-; SI-NEXT: v_readlane_b32 s66, v63, 18
-; SI-NEXT: v_readlane_b32 s65, v63, 17
-; SI-NEXT: v_readlane_b32 s64, v63, 16
-; SI-NEXT: v_readlane_b32 s55, v63, 15
-; SI-NEXT: v_readlane_b32 s54, v63, 14
-; SI-NEXT: v_readlane_b32 s53, v63, 13
-; SI-NEXT: v_readlane_b32 s52, v63, 12
-; SI-NEXT: v_readlane_b32 s51, v63, 11
-; SI-NEXT: v_readlane_b32 s50, v63, 10
-; SI-NEXT: v_readlane_b32 s49, v63, 9
-; SI-NEXT: v_readlane_b32 s48, v63, 8
-; SI-NEXT: v_readlane_b32 s39, v63, 7
-; SI-NEXT: v_readlane_b32 s38, v63, 6
-; SI-NEXT: v_readlane_b32 s37, v63, 5
-; SI-NEXT: v_readlane_b32 s36, v63, 4
-; SI-NEXT: v_readlane_b32 s35, v63, 3
-; SI-NEXT: v_readlane_b32 s34, v63, 2
-; SI-NEXT: v_readlane_b32 s31, v63, 1
-; SI-NEXT: v_readlane_b32 s30, v63, 0
+; SI-NEXT: v_readlane_b32 s30, v63, 34
+; SI-NEXT: v_readlane_b32 s31, v63, 35
+; SI-NEXT: v_readlane_b32 s99, v63, 33
+; SI-NEXT: v_readlane_b32 s98, v63, 32
+; SI-NEXT: v_readlane_b32 s97, v63, 31
+; SI-NEXT: v_readlane_b32 s96, v63, 30
+; SI-NEXT: v_readlane_b32 s87, v63, 29
+; SI-NEXT: v_readlane_b32 s86, v63, 28
+; SI-NEXT: v_readlane_b32 s85, v63, 27
+; SI-NEXT: v_readlane_b32 s84, v63, 26
+; SI-NEXT: v_readlane_b32 s83, v63, 25
+; SI-NEXT: v_readlane_b32 s82, v63, 24
+; SI-NEXT: v_readlane_b32 s81, v63, 23
+; SI-NEXT: v_readlane_b32 s80, v63, 22
+; SI-NEXT: v_readlane_b32 s71, v63, 21
+; SI-NEXT: v_readlane_b32 s70, v63, 20
+; SI-NEXT: v_readlane_b32 s69, v63, 19
+; SI-NEXT: v_readlane_b32 s68, v63, 18
+; SI-NEXT: v_readlane_b32 s67, v63, 17
+; SI-NEXT: v_readlane_b32 s66, v63, 16
+; SI-NEXT: v_readlane_b32 s65, v63, 15
+; SI-NEXT: v_readlane_b32 s64, v63, 14
+; SI-NEXT: v_readlane_b32 s55, v63, 13
+; SI-NEXT: v_readlane_b32 s54, v63, 12
+; SI-NEXT: v_readlane_b32 s53, v63, 11
+; SI-NEXT: v_readlane_b32 s52, v63, 10
+; SI-NEXT: v_readlane_b32 s51, v63, 9
+; SI-NEXT: v_readlane_b32 s50, v63, 8
+; SI-NEXT: v_readlane_b32 s49, v63, 7
+; SI-NEXT: v_readlane_b32 s48, v63, 6
+; SI-NEXT: v_readlane_b32 s39, v63, 5
+; SI-NEXT: v_readlane_b32 s38, v63, 4
+; SI-NEXT: v_readlane_b32 s37, v63, 3
+; SI-NEXT: v_readlane_b32 s36, v63, 2
+; SI-NEXT: v_readlane_b32 s35, v63, 1
+; SI-NEXT: v_readlane_b32 s34, v63, 0
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
@@ -138829,42 +138829,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_mov_b32 s76, s28
; SI-NEXT: s_mov_b32 s77, s26
; SI-NEXT: s_mov_b32 s79, s24
-; SI-NEXT: v_writelane_b32 v41, s30, 0
-; SI-NEXT: v_writelane_b32 v41, s31, 1
-; SI-NEXT: v_writelane_b32 v41, s34, 2
-; SI-NEXT: v_writelane_b32 v41, s35, 3
-; SI-NEXT: v_writelane_b32 v41, s36, 4
-; SI-NEXT: v_writelane_b32 v41, s37, 5
-; SI-NEXT: v_writelane_b32 v41, s38, 6
-; SI-NEXT: v_writelane_b32 v41, s39, 7
-; SI-NEXT: v_writelane_b32 v41, s48, 8
-; SI-NEXT: v_writelane_b32 v41, s49, 9
-; SI-NEXT: v_writelane_b32 v41, s50, 10
-; SI-NEXT: v_writelane_b32 v41, s51, 11
-; SI-NEXT: v_writelane_b32 v41, s52, 12
-; SI-NEXT: v_writelane_b32 v41, s53, 13
-; SI-NEXT: v_writelane_b32 v41, s54, 14
-; SI-NEXT: v_writelane_b32 v41, s55, 15
-; SI-NEXT: v_writelane_b32 v41, s64, 16
-; SI-NEXT: v_writelane_b32 v41, s65, 17
-; SI-NEXT: v_writelane_b32 v41, s66, 18
-; SI-NEXT: v_writelane_b32 v41, s67, 19
-; SI-NEXT: v_writelane_b32 v41, s68, 20
-; SI-NEXT: v_writelane_b32 v41, s69, 21
-; SI-NEXT: v_writelane_b32 v41, s70, 22
-; SI-NEXT: v_writelane_b32 v41, s71, 23
-; SI-NEXT: v_writelane_b32 v41, s80, 24
-; SI-NEXT: v_writelane_b32 v41, s81, 25
-; SI-NEXT: v_writelane_b32 v41, s82, 26
-; SI-NEXT: v_writelane_b32 v41, s83, 27
-; SI-NEXT: v_writelane_b32 v41, s84, 28
-; SI-NEXT: v_writelane_b32 v41, s85, 29
-; SI-NEXT: v_writelane_b32 v41, s86, 30
-; SI-NEXT: v_writelane_b32 v41, s87, 31
-; SI-NEXT: v_writelane_b32 v41, s96, 32
-; SI-NEXT: v_writelane_b32 v41, s97, 33
-; SI-NEXT: v_writelane_b32 v41, s98, 34
-; SI-NEXT: v_writelane_b32 v41, s99, 35
+; SI-NEXT: v_writelane_b32 v41, s34, 0
+; SI-NEXT: v_writelane_b32 v41, s35, 1
+; SI-NEXT: v_writelane_b32 v41, s36, 2
+; SI-NEXT: v_writelane_b32 v41, s37, 3
+; SI-NEXT: v_writelane_b32 v41, s38, 4
+; SI-NEXT: v_writelane_b32 v41, s39, 5
+; SI-NEXT: v_writelane_b32 v41, s48, 6
+; SI-NEXT: v_writelane_b32 v41, s49, 7
+; SI-NEXT: v_writelane_b32 v41, s50, 8
+; SI-NEXT: v_writelane_b32 v41, s51, 9
+; SI-NEXT: v_writelane_b32 v41, s52, 10
+; SI-NEXT: v_writelane_b32 v41, s53, 11
+; SI-NEXT: v_writelane_b32 v41, s54, 12
+; SI-NEXT: v_writelane_b32 v41, s55, 13
+; SI-NEXT: v_writelane_b32 v41, s64, 14
+; SI-NEXT: v_writelane_b32 v41, s65, 15
+; SI-NEXT: v_writelane_b32 v41, s66, 16
+; SI-NEXT: v_writelane_b32 v41, s67, 17
+; SI-NEXT: v_writelane_b32 v41, s68, 18
+; SI-NEXT: v_writelane_b32 v41, s69, 19
+; SI-NEXT: v_writelane_b32 v41, s70, 20
+; SI-NEXT: v_writelane_b32 v41, s71, 21
+; SI-NEXT: v_writelane_b32 v41, s80, 22
+; SI-NEXT: v_writelane_b32 v41, s81, 23
+; SI-NEXT: v_writelane_b32 v41, s82, 24
+; SI-NEXT: v_writelane_b32 v41, s83, 25
+; SI-NEXT: v_writelane_b32 v41, s84, 26
+; SI-NEXT: v_writelane_b32 v41, s85, 27
+; SI-NEXT: v_writelane_b32 v41, s86, 28
+; SI-NEXT: v_writelane_b32 v41, s87, 29
+; SI-NEXT: v_writelane_b32 v41, s96, 30
+; SI-NEXT: v_writelane_b32 v41, s97, 31
+; SI-NEXT: v_writelane_b32 v41, s98, 32
+; SI-NEXT: v_writelane_b32 v41, s99, 33
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
@@ -138895,6 +138893,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s18, v0
; SI-NEXT: v_readfirstlane_b32 s19, v1
; SI-NEXT: v_readfirstlane_b32 s89, v4
+; SI-NEXT: v_readfirstlane_b32 s90, v5
+; SI-NEXT: v_readfirstlane_b32 s91, v3
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296
@@ -138927,12 +138927,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v44, s4, 10
; SI-NEXT: v_readfirstlane_b32 s4, v38
; SI-NEXT: v_writelane_b32 v44, s4, 11
-; SI-NEXT: v_readfirstlane_b32 s90, v5
-; SI-NEXT: v_readfirstlane_b32 s91, v3
; SI-NEXT: v_readfirstlane_b32 s92, v2
; SI-NEXT: v_readfirstlane_b32 s93, v8
; SI-NEXT: v_readfirstlane_b32 s94, v9
; SI-NEXT: v_readfirstlane_b32 s95, v7
+; SI-NEXT: v_writelane_b32 v41, s30, 34
+; SI-NEXT: v_writelane_b32 v41, s31, 35
; SI-NEXT: v_readfirstlane_b32 s30, v13
; SI-NEXT: v_readfirstlane_b32 s31, v11
; SI-NEXT: v_readfirstlane_b32 s34, v10
@@ -140314,43 +140314,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_mul_f32_e64 v31, 1.0, s83
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31
; SI-NEXT: v_mul_f32_e64 v31, 1.0, s84
+; SI-NEXT: v_readlane_b32 s30, v41, 34
; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT: v_readlane_b32 s99, v41, 35
-; SI-NEXT: v_readlane_b32 s98, v41, 34
-; SI-NEXT: v_readlane_b32 s97, v41, 33
-; SI-NEXT: v_readlane_b32 s96, v41, 32
-; SI-NEXT: v_readlane_b32 s87, v41, 31
-; SI-NEXT: v_readlane_b32 s86, v41, 30
-; SI-NEXT: v_readlane_b32 s85, v41, 29
-; SI-NEXT: v_readlane_b32 s84, v41, 28
-; SI-NEXT: v_readlane_b32 s83, v41, 27
-; SI-NEXT: v_readlane_b32 s82, v41, 26
-; SI-NEXT: v_readlane_b32 s81, v41, 25
-; SI-NEXT: v_readlane_b32 s80, v41, 24
-; SI-NEXT: v_readlane_b32 s71, v41, 23
-; SI-NEXT: v_readlane_b32 s70, v41, 22
-; SI-NEXT: v_readlane_b32 s69, v41, 21
-; SI-NEXT: v_readlane_b32 s68, v41, 20
-; SI-NEXT: v_readlane_b32 s67, v41, 19
-; SI-NEXT: v_readlane_b32 s66, v41, 18
-; SI-NEXT: v_readlane_b32 s65, v41, 17
-; SI-NEXT: v_readlane_b32 s64, v41, 16
-; SI-NEXT: v_readlane_b32 s55, v41, 15
-; SI-NEXT: v_readlane_b32 s54, v41, 14
-; SI-NEXT: v_readlane_b32 s53, v41, 13
-; SI-NEXT: v_readlane_b32 s52, v41, 12
-; SI-NEXT: v_readlane_b32 s51, v41, 11
-; SI-NEXT: v_readlane_b32 s50, v41, 10
-; SI-NEXT: v_readlane_b32 s49, v41, 9
-; SI-NEXT: v_readlane_b32 s48, v41, 8
-; SI-NEXT: v_readlane_b32 s39, v41, 7
-; SI-NEXT: v_readlane_b32 s38, v41, 6
-; SI-NEXT: v_readlane_b32 s37, v41, 5
-; SI-NEXT: v_readlane_b32 s36, v41, 4
-; SI-NEXT: v_readlane_b32 s35, v41, 3
-; SI-NEXT: v_readlane_b32 s34, v41, 2
-; SI-NEXT: v_readlane_b32 s31, v41, 1
-; SI-NEXT: v_readlane_b32 s30, v41, 0
+; SI-NEXT: v_readlane_b32 s31, v41, 35
+; SI-NEXT: v_readlane_b32 s99, v41, 33
+; SI-NEXT: v_readlane_b32 s98, v41, 32
+; SI-NEXT: v_readlane_b32 s97, v41, 31
+; SI-NEXT: v_readlane_b32 s96, v41, 30
+; SI-NEXT: v_readlane_b32 s87, v41, 29
+; SI-NEXT: v_readlane_b32 s86, v41, 28
+; SI-NEXT: v_readlane_b32 s85, v41, 27
+; SI-NEXT: v_readlane_b32 s84, v41, 26
+; SI-NEXT: v_readlane_b32 s83, v41, 25
+; SI-NEXT: v_readlane_b32 s82, v41, 24
+; SI-NEXT: v_readlane_b32 s81, v41, 23
+; SI-NEXT: v_readlane_b32 s80, v41, 22
+; SI-NEXT: v_readlane_b32 s71, v41, 21
+; SI-NEXT: v_readlane_b32 s70, v41, 20
+; SI-NEXT: v_readlane_b32 s69, v41, 19
+; SI-NEXT: v_readlane_b32 s68, v41, 18
+; SI-NEXT: v_readlane_b32 s67, v41, 17
+; SI-NEXT: v_readlane_b32 s66, v41, 16
+; SI-NEXT: v_readlane_b32 s65, v41, 15
+; SI-NEXT: v_readlane_b32 s64, v41, 14
+; SI-NEXT: v_readlane_b32 s55, v41, 13
+; SI-NEXT: v_readlane_b32 s54, v41, 12
+; SI-NEXT: v_readlane_b32 s53, v41, 11
+; SI-NEXT: v_readlane_b32 s52, v41, 10
+; SI-NEXT: v_readlane_b32 s51, v41, 9
+; SI-NEXT: v_readlane_b32 s50, v41, 8
+; SI-NEXT: v_readlane_b32 s49, v41, 7
+; SI-NEXT: v_readlane_b32 s48, v41, 6
+; SI-NEXT: v_readlane_b32 s39, v41, 5
+; SI-NEXT: v_readlane_b32 s38, v41, 4
+; SI-NEXT: v_readlane_b32 s37, v41, 3
+; SI-NEXT: v_readlane_b32 s36, v41, 2
+; SI-NEXT: v_readlane_b32 s35, v41, 1
+; SI-NEXT: v_readlane_b32 s34, v41, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
@@ -150972,38 +150972,38 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v63, s30, 0
-; SI-NEXT: v_writelane_b32 v63, s31, 1
-; SI-NEXT: v_writelane_b32 v63, s34, 2
-; SI-NEXT: v_writelane_b32 v63, s35, 3
-; SI-NEXT: v_writelane_b32 v63, s36, 4
-; SI-NEXT: v_writelane_b32 v63, s37, 5
-; SI-NEXT: v_writelane_b32 v63, s38, 6
-; SI-NEXT: v_writelane_b32 v63, s39, 7
-; SI-NEXT: v_writelane_b32 v63, s48, 8
-; SI-NEXT: v_writelane_b32 v63, s49, 9
-; SI-NEXT: v_writelane_b32 v63, s50, 10
-; SI-NEXT: v_writelane_b32 v63, s51, 11
-; SI-NEXT: v_writelane_b32 v63, s52, 12
-; SI-NEXT: v_writelane_b32 v63, s53, 13
-; SI-NEXT: v_writelane_b32 v63, s54, 14
-; SI-NEXT: v_writelane_b32 v63, s55, 15
-; SI-NEXT: v_writelane_b32 v63, s64, 16
-; SI-NEXT: v_writelane_b32 v63, s65, 17
-; SI-NEXT: v_writelane_b32 v63, s66, 18
-; SI-NEXT: v_writelane_b32 v63, s67, 19
-; SI-NEXT: v_writelane_b32 v63, s68, 20
-; SI-NEXT: v_writelane_b32 v63, s69, 21
-; SI-NEXT: v_writelane_b32 v63, s70, 22
-; SI-NEXT: v_writelane_b32 v63, s71, 23
-; SI-NEXT: v_writelane_b32 v63, s80, 24
-; SI-NEXT: v_writelane_b32 v63, s81, 25
-; SI-NEXT: v_writelane_b32 v63, s82, 26
-; SI-NEXT: v_writelane_b32 v63, s83, 27
-; SI-NEXT: v_writelane_b32 v63, s84, 28
-; SI-NEXT: v_writelane_b32 v63, s85, 29
-; SI-NEXT: v_writelane_b32 v63, s86, 30
-; SI-NEXT: v_writelane_b32 v63, s87, 31
+; SI-NEXT: v_writelane_b32 v63, s34, 0
+; SI-NEXT: v_writelane_b32 v63, s35, 1
+; SI-NEXT: v_writelane_b32 v63, s36, 2
+; SI-NEXT: v_writelane_b32 v63, s37, 3
+; SI-NEXT: v_writelane_b32 v63, s38, 4
+; SI-NEXT: v_writelane_b32 v63, s39, 5
+; SI-NEXT: v_writelane_b32 v63, s48, 6
+; SI-NEXT: v_writelane_b32 v63, s49, 7
+; SI-NEXT: v_writelane_b32 v63, s50, 8
+; SI-NEXT: v_writelane_b32 v63, s51, 9
+; SI-NEXT: v_writelane_b32 v63, s52, 10
+; SI-NEXT: v_writelane_b32 v63, s53, 11
+; SI-NEXT: v_writelane_b32 v63, s54, 12
+; SI-NEXT: v_writelane_b32 v63, s55, 13
+; SI-NEXT: v_writelane_b32 v63, s64, 14
+; SI-NEXT: v_writelane_b32 v63, s65, 15
+; SI-NEXT: v_writelane_b32 v63, s66, 16
+; SI-NEXT: v_writelane_b32 v63, s67, 17
+; SI-NEXT: v_writelane_b32 v63, s68, 18
+; SI-NEXT: v_writelane_b32 v63, s69, 19
+; SI-NEXT: v_writelane_b32 v63, s70, 20
+; SI-NEXT: v_writelane_b32 v63, s71, 21
+; SI-NEXT: v_writelane_b32 v63, s80, 22
+; SI-NEXT: v_writelane_b32 v63, s81, 23
+; SI-NEXT: v_writelane_b32 v63, s82, 24
+; SI-NEXT: v_writelane_b32 v63, s83, 25
+; SI-NEXT: v_writelane_b32 v63, s84, 26
+; SI-NEXT: v_writelane_b32 v63, s85, 27
+; SI-NEXT: v_writelane_b32 v63, s86, 28
+; SI-NEXT: v_writelane_b32 v63, s87, 29
+; SI-NEXT: v_writelane_b32 v63, s96, 30
+; SI-NEXT: v_writelane_b32 v63, s97, 31
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17
; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v7
@@ -151023,13 +151023,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT: v_writelane_b32 v63, s96, 32
+; SI-NEXT: v_writelane_b32 v63, s98, 32
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v7, 1.0, v19
-; SI-NEXT: v_writelane_b32 v63, s97, 33
+; SI-NEXT: v_writelane_b32 v63, s99, 33
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v18
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v14
; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3
@@ -151037,7 +151037,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v7, 1.0, v17
-; SI-NEXT: v_writelane_b32 v63, s98, 34
+; SI-NEXT: v_writelane_b32 v63, s30, 34
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -151097,7 +151097,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v7, 1.0, v20
-; SI-NEXT: v_writelane_b32 v63, s99, 35
+; SI-NEXT: v_writelane_b32 v63, s31, 35
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mul_f32_e32 v39, 1.0, v29
; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1
@@ -152807,6 +152807,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: v_readlane_b32 s30, v63, 34
; SI-NEXT: v_readlane_b32 s61, v62, 35
; SI-NEXT: v_readlane_b32 s43, v62, 47
; SI-NEXT: v_readlane_b32 s27, v61, 7
@@ -152814,42 +152815,41 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s17, v61, 19
; SI-NEXT: v_readlane_b32 s11, v61, 25
; SI-NEXT: v_readlane_b32 s9, v61, 33
-; SI-NEXT: v_readlane_b32 s99, v63, 35
-; SI-NEXT: v_readlane_b32 s98, v63, 34
-; SI-NEXT: v_readlane_b32 s97, v63, 33
-; SI-NEXT: v_readlane_b32 s96, v63, 32
-; SI-NEXT: v_readlane_b32 s87, v63, 31
-; SI-NEXT: v_readlane_b32 s86, v63, 30
-; SI-NEXT: v_readlane_b32 s85, v63, 29
-; SI-NEXT: v_readlane_b32 s84, v63, 28
-; SI-NEXT: v_readlane_b32 s83, v63, 27
-; SI-NEXT: v_readlane_b32 s82, v63, 26
-; SI-NEXT: v_readlane_b32 s81, v63, 25
-; SI-NEXT: v_readlane_b32 s80, v63, 24
-; SI-NEXT: v_readlane_b32 s71, v63, 23
-; SI-NEXT: v_readlane_b32 s70, v63, 22
-; SI-NEXT: v_readlane_b32 s69, v63, 21
-; SI-NEXT: v_readlane_b32 s68, v63, 20
-; SI-NEXT: v_readlane_b32 s67, v63, 19
-; SI-NEXT: v_readlane_b32 s66, v63, 18
-; SI-NEXT: v_readlane_b32 s65, v63, 17
-; SI-NEXT: v_readlane_b32 s64, v63, 16
-; SI-NEXT: v_readlane_b32 s55, v63, 15
-; SI-NEXT: v_readlane_b32 s54, v63, 14
-; SI-NEXT: v_readlane_b32 s53, v63, 13
-; SI-NEXT: v_readlane_b32 s52, v63, 12
-; SI-NEXT: v_readlane_b32 s51, v63, 11
-; SI-NEXT: v_readlane_b32 s50, v63, 10
-; SI-NEXT: v_readlane_b32 s49, v63, 9
-; SI-NEXT: v_readlane_b32 s48, v63, 8
-; SI-NEXT: v_readlane_b32 s39, v63, 7
-; SI-NEXT: v_readlane_b32 s38, v63, 6
-; SI-NEXT: v_readlane_b32 s37, v63, 5
-; SI-NEXT: v_readlane_b32 s36, v63, 4
-; SI-NEXT: v_readlane_b32 s35, v63, 3
-; SI-NEXT: v_readlane_b32 s34, v63, 2
-; SI-NEXT: v_readlane_b32 s31, v63, 1
-; SI-NEXT: v_readlane_b32 s30, v63, 0
+; SI-NEXT: v_readlane_b32 s31, v63, 35
+; SI-NEXT: v_readlane_b32 s99, v63, 33
+; SI-NEXT: v_readlane_b32 s98, v63, 32
+; SI-NEXT: v_readlane_b32 s97, v63, 31
+; SI-NEXT: v_readlane_b32 s96, v63, 30
+; SI-NEXT: v_readlane_b32 s87, v63, 29
+; SI-NEXT: v_readlane_b32 s86, v63, 28
+; SI-NEXT: v_readlane_b32 s85, v63, 27
+; SI-NEXT: v_readlane_b32 s84, v63, 26
+; SI-NEXT: v_readlane_b32 s83, v63, 25
+; SI-NEXT: v_readlane_b32 s82, v63, 24
+; SI-NEXT: v_readlane_b32 s81, v63, 23
+; SI-NEXT: v_readlane_b32 s80, v63, 22
+; SI-NEXT: v_readlane_b32 s71, v63, 21
+; SI-NEXT: v_readlane_b32 s70, v63, 20
+; SI-NEXT: v_readlane_b32 s69, v63, 19
+; SI-NEXT: v_readlane_b32 s68, v63, 18
+; SI-NEXT: v_readlane_b32 s67, v63, 17
+; SI-NEXT: v_readlane_b32 s66, v63, 16
+; SI-NEXT: v_readlane_b32 s65, v63, 15
+; SI-NEXT: v_readlane_b32 s64, v63, 14
+; SI-NEXT: v_readlane_b32 s55, v63, 13
+; SI-NEXT: v_readlane_b32 s54, v63, 12
+; SI-NEXT: v_readlane_b32 s53, v63, 11
+; SI-NEXT: v_readlane_b32 s52, v63, 10
+; SI-NEXT: v_readlane_b32 s51, v63, 9
+; SI-NEXT: v_readlane_b32 s50, v63, 8
+; SI-NEXT: v_readlane_b32 s49, v63, 7
+; SI-NEXT: v_readlane_b32 s48, v63, 6
+; SI-NEXT: v_readlane_b32 s39, v63, 5
+; SI-NEXT: v_readlane_b32 s38, v63, 4
+; SI-NEXT: v_readlane_b32 s37, v63, 3
+; SI-NEXT: v_readlane_b32 s36, v63, 2
+; SI-NEXT: v_readlane_b32 s35, v63, 1
+; SI-NEXT: v_readlane_b32 s34, v63, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; SI-NEXT: v_or_b32_e32 v1, s5, v1
@@ -152883,37 +152883,37 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v63, s30, 0
-; VI-NEXT: v_writelane_b32 v63, s31, 1
-; VI-NEXT: v_writelane_b32 v63, s34, 2
-; VI-NEXT: v_writelane_b32 v63, s35, 3
-; VI-NEXT: v_writelane_b32 v63, s36, 4
-; VI-NEXT: v_writelane_b32 v63, s37, 5
-; VI-NEXT: v_writelane_b32 v63, s38, 6
-; VI-NEXT: v_writelane_b32 v63, s39, 7
-; VI-NEXT: v_writelane_b32 v63, s48, 8
-; VI-NEXT: v_writelane_b32 v63, s49, 9
-; VI-NEXT: v_writelane_b32 v63, s50, 10
-; VI-NEXT: v_writelane_b32 v63, s51, 11
-; VI-NEXT: v_writelane_b32 v63, s52, 12
-; VI-NEXT: v_writelane_b32 v63, s53, 13
-; VI-NEXT: v_writelane_b32 v63, s54, 14
-; VI-NEXT: v_writelane_b32 v63, s55, 15
-; VI-NEXT: v_writelane_b32 v63, s64, 16
-; VI-NEXT: v_writelane_b32 v63, s65, 17
-; VI-NEXT: v_writelane_b32 v63, s66, 18
-; VI-NEXT: v_writelane_b32 v63, s67, 19
-; VI-NEXT: v_writelane_b32 v63, s68, 20
-; VI-NEXT: v_writelane_b32 v63, s69, 21
-; VI-NEXT: v_writelane_b32 v63, s70, 22
-; VI-NEXT: v_writelane_b32 v63, s71, 23
-; VI-NEXT: v_writelane_b32 v63, s80, 24
-; VI-NEXT: v_writelane_b32 v63, s81, 25
-; VI-NEXT: v_writelane_b32 v63, s82, 26
-; VI-NEXT: v_writelane_b32 v63, s83, 27
-; VI-NEXT: v_writelane_b32 v63, s84, 28
-; VI-NEXT: v_writelane_b32 v63, s85, 29
-; VI-NEXT: v_writelane_b32 v63, s86, 30
+; VI-NEXT: v_writelane_b32 v63, s34, 0
+; VI-NEXT: v_writelane_b32 v63, s35, 1
+; VI-NEXT: v_writelane_b32 v63, s36, 2
+; VI-NEXT: v_writelane_b32 v63, s37, 3
+; VI-NEXT: v_writelane_b32 v63, s38, 4
+; VI-NEXT: v_writelane_b32 v63, s39, 5
+; VI-NEXT: v_writelane_b32 v63, s48, 6
+; VI-NEXT: v_writelane_b32 v63, s49, 7
+; VI-NEXT: v_writelane_b32 v63, s50, 8
+; VI-NEXT: v_writelane_b32 v63, s51, 9
+; VI-NEXT: v_writelane_b32 v63, s52, 10
+; VI-NEXT: v_writelane_b32 v63, s53, 11
+; VI-NEXT: v_writelane_b32 v63, s54, 12
+; VI-NEXT: v_writelane_b32 v63, s55, 13
+; VI-NEXT: v_writelane_b32 v63, s64, 14
+; VI-NEXT: v_writelane_b32 v63, s65, 15
+; VI-NEXT: v_writelane_b32 v63, s66, 16
+; VI-NEXT: v_writelane_b32 v63, s67, 17
+; VI-NEXT: v_writelane_b32 v63, s68, 18
+; VI-NEXT: v_writelane_b32 v63, s69, 19
+; VI-NEXT: v_writelane_b32 v63, s70, 20
+; VI-NEXT: v_writelane_b32 v63, s71, 21
+; VI-NEXT: v_writelane_b32 v63, s80, 22
+; VI-NEXT: v_writelane_b32 v63, s81, 23
+; VI-NEXT: v_writelane_b32 v63, s82, 24
+; VI-NEXT: v_writelane_b32 v63, s83, 25
+; VI-NEXT: v_writelane_b32 v63, s84, 26
+; VI-NEXT: v_writelane_b32 v63, s85, 27
+; VI-NEXT: v_writelane_b32 v63, s86, 28
+; VI-NEXT: v_writelane_b32 v63, s87, 29
+; VI-NEXT: v_writelane_b32 v63, s30, 30
; VI-NEXT: v_readfirstlane_b32 s56, v3
; VI-NEXT: v_mov_b32_e32 v3, s16
; VI-NEXT: v_readfirstlane_b32 s57, v4
@@ -152943,7 +152943,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readfirstlane_b32 s23, v16
; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT: v_writelane_b32 v63, s87, 31
+; VI-NEXT: v_writelane_b32 v63, s31, 31
; VI-NEXT: v_readfirstlane_b32 s20, v17
; VI-NEXT: v_readfirstlane_b32 s21, v18
; VI-NEXT: v_readfirstlane_b32 s18, v3
@@ -154414,38 +154414,38 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; VI-NEXT: v_perm_b32 v5, v32, v34, s4
-; VI-NEXT: v_readlane_b32 s87, v63, 31
-; VI-NEXT: v_readlane_b32 s86, v63, 30
-; VI-NEXT: v_readlane_b32 s85, v63, 29
-; VI-NEXT: v_readlane_b32 s84, v63, 28
-; VI-NEXT: v_readlane_b32 s83, v63, 27
-; VI-NEXT: v_readlane_b32 s82, v63, 26
-; VI-NEXT: v_readlane_b32 s81, v63, 25
-; VI-NEXT: v_readlane_b32 s80, v63, 24
-; VI-NEXT: v_readlane_b32 s71, v63, 23
-; VI-NEXT: v_readlane_b32 s70, v63, 22
-; VI-NEXT: v_readlane_b32 s69, v63, 21
-; VI-NEXT: v_readlane_b32 s68, v63, 20
-; VI-NEXT: v_readlane_b32 s67, v63, 19
-; VI-NEXT: v_readlane_b32 s66, v63, 18
-; VI-NEXT: v_readlane_b32 s65, v63, 17
-; VI-NEXT: v_readlane_b32 s64, v63, 16
-; VI-NEXT: v_readlane_b32 s55, v63, 15
-; VI-NEXT: v_readlane_b32 s54, v63, 14
-; VI-NEXT: v_readlane_b32 s53, v63, 13
-; VI-NEXT: v_readlane_b32 s52, v63, 12
-; VI-NEXT: v_readlane_b32 s51, v63, 11
-; VI-NEXT: v_readlane_b32 s50, v63, 10
-; VI-NEXT: v_readlane_b32 s49, v63, 9
-; VI-NEXT: v_readlane_b32 s48, v63, 8
-; VI-NEXT: v_readlane_b32 s39, v63, 7
-; VI-NEXT: v_readlane_b32 s38, v63, 6
-; VI-NEXT: v_readlane_b32 s37, v63, 5
-; VI-NEXT: v_readlane_b32 s36, v63, 4
-; VI-NEXT: v_readlane_b32 s35, v63, 3
-; VI-NEXT: v_readlane_b32 s34, v63, 2
-; VI-NEXT: v_readlane_b32 s31, v63, 1
-; VI-NEXT: v_readlane_b32 s30, v63, 0
+; VI-NEXT: v_readlane_b32 s30, v63, 30
+; VI-NEXT: v_readlane_b32 s31, v63, 31
+; VI-NEXT: v_readlane_b32 s87, v63, 29
+; VI-NEXT: v_readlane_b32 s86, v63, 28
+; VI-NEXT: v_readlane_b32 s85, v63, 27
+; VI-NEXT: v_readlane_b32 s84, v63, 26
+; VI-NEXT: v_readlane_b32 s83, v63, 25
+; VI-NEXT: v_readlane_b32 s82, v63, 24
+; VI-NEXT: v_readlane_b32 s81, v63, 23
+; VI-NEXT: v_readlane_b32 s80, v63, 22
+; VI-NEXT: v_readlane_b32 s71, v63, 21
+; VI-NEXT: v_readlane_b32 s70, v63, 20
+; VI-NEXT: v_readlane_b32 s69, v63, 19
+; VI-NEXT: v_readlane_b32 s68, v63, 18
+; VI-NEXT: v_readlane_b32 s67, v63, 17
+; VI-NEXT: v_readlane_b32 s66, v63, 16
+; VI-NEXT: v_readlane_b32 s65, v63, 15
+; VI-NEXT: v_readlane_b32 s64, v63, 14
+; VI-NEXT: v_readlane_b32 s55, v63, 13
+; VI-NEXT: v_readlane_b32 s54, v63, 12
+; VI-NEXT: v_readlane_b32 s53, v63, 11
+; VI-NEXT: v_readlane_b32 s52, v63, 10
+; VI-NEXT: v_readlane_b32 s51, v63, 9
+; VI-NEXT: v_readlane_b32 s50, v63, 8
+; VI-NEXT: v_readlane_b32 s49, v63, 7
+; VI-NEXT: v_readlane_b32 s48, v63, 6
+; VI-NEXT: v_readlane_b32 s39, v63, 5
+; VI-NEXT: v_readlane_b32 s38, v63, 4
+; VI-NEXT: v_readlane_b32 s37, v63, 3
+; VI-NEXT: v_readlane_b32 s36, v63, 2
+; VI-NEXT: v_readlane_b32 s35, v63, 1
+; VI-NEXT: v_readlane_b32 s34, v63, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v4, v2, v4, s4
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -154769,41 +154769,41 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v63, s30, 0
-; GFX9-NEXT: v_writelane_b32 v63, s31, 1
-; GFX9-NEXT: v_writelane_b32 v63, s34, 2
-; GFX9-NEXT: v_writelane_b32 v63, s35, 3
-; GFX9-NEXT: v_writelane_b32 v63, s36, 4
-; GFX9-NEXT: v_writelane_b32 v63, s37, 5
-; GFX9-NEXT: v_writelane_b32 v63, s38, 6
-; GFX9-NEXT: v_writelane_b32 v63, s39, 7
-; GFX9-NEXT: v_writelane_b32 v63, s48, 8
-; GFX9-NEXT: v_writelane_b32 v63, s49, 9
-; GFX9-NEXT: v_writelane_b32 v63, s50, 10
-; GFX9-NEXT: v_writelane_b32 v63, s51, 11
-; GFX9-NEXT: v_writelane_b32 v63, s52, 12
-; GFX9-NEXT: v_writelane_b32 v63, s53, 13
-; GFX9-NEXT: v_writelane_b32 v63, s54, 14
-; GFX9-NEXT: v_writelane_b32 v63, s55, 15
-; GFX9-NEXT: v_writelane_b32 v63, s64, 16
-; GFX9-NEXT: v_writelane_b32 v63, s65, 17
-; GFX9-NEXT: v_writelane_b32 v63, s66, 18
-; GFX9-NEXT: v_writelane_b32 v63, s67, 19
-; GFX9-NEXT: v_writelane_b32 v63, s68, 20
-; GFX9-NEXT: v_writelane_b32 v63, s69, 21
-; GFX9-NEXT: v_writelane_b32 v63, s70, 22
-; GFX9-NEXT: v_writelane_b32 v63, s71, 23
-; GFX9-NEXT: v_writelane_b32 v63, s80, 24
-; GFX9-NEXT: v_writelane_b32 v63, s81, 25
-; GFX9-NEXT: v_writelane_b32 v63, s82, 26
-; GFX9-NEXT: v_writelane_b32 v63, s83, 27
-; GFX9-NEXT: v_writelane_b32 v63, s84, 28
-; GFX9-NEXT: v_writelane_b32 v63, s85, 29
-; GFX9-NEXT: v_writelane_b32 v63, s86, 30
-; GFX9-NEXT: v_writelane_b32 v63, s87, 31
-; GFX9-NEXT: v_writelane_b32 v63, s96, 32
-; GFX9-NEXT: v_writelane_b32 v63, s97, 33
-; GFX9-NEXT: v_writelane_b32 v63, s98, 34
+; GFX9-NEXT: v_writelane_b32 v63, s34, 0
+; GFX9-NEXT: v_writelane_b32 v63, s35, 1
+; GFX9-NEXT: v_writelane_b32 v63, s36, 2
+; GFX9-NEXT: v_writelane_b32 v63, s37, 3
+; GFX9-NEXT: v_writelane_b32 v63, s38, 4
+; GFX9-NEXT: v_writelane_b32 v63, s39, 5
+; GFX9-NEXT: v_writelane_b32 v63, s48, 6
+; GFX9-NEXT: v_writelane_b32 v63, s49, 7
+; GFX9-NEXT: v_writelane_b32 v63, s50, 8
+; GFX9-NEXT: v_writelane_b32 v63, s51, 9
+; GFX9-NEXT: v_writelane_b32 v63, s52, 10
+; GFX9-NEXT: v_writelane_b32 v63, s53, 11
+; GFX9-NEXT: v_writelane_b32 v63, s54, 12
+; GFX9-NEXT: v_writelane_b32 v63, s55, 13
+; GFX9-NEXT: v_writelane_b32 v63, s64, 14
+; GFX9-NEXT: v_writelane_b32 v63, s65, 15
+; GFX9-NEXT: v_writelane_b32 v63, s66, 16
+; GFX9-NEXT: v_writelane_b32 v63, s67, 17
+; GFX9-NEXT: v_writelane_b32 v63, s68, 18
+; GFX9-NEXT: v_writelane_b32 v63, s69, 19
+; GFX9-NEXT: v_writelane_b32 v63, s70, 20
+; GFX9-NEXT: v_writelane_b32 v63, s71, 21
+; GFX9-NEXT: v_writelane_b32 v63, s80, 22
+; GFX9-NEXT: v_writelane_b32 v63, s81, 23
+; GFX9-NEXT: v_writelane_b32 v63, s82, 24
+; GFX9-NEXT: v_writelane_b32 v63, s83, 25
+; GFX9-NEXT: v_writelane_b32 v63, s84, 26
+; GFX9-NEXT: v_writelane_b32 v63, s85, 27
+; GFX9-NEXT: v_writelane_b32 v63, s86, 28
+; GFX9-NEXT: v_writelane_b32 v63, s87, 29
+; GFX9-NEXT: v_writelane_b32 v63, s96, 30
+; GFX9-NEXT: v_writelane_b32 v63, s97, 31
+; GFX9-NEXT: v_writelane_b32 v63, s98, 32
+; GFX9-NEXT: v_writelane_b32 v63, s99, 33
+; GFX9-NEXT: v_writelane_b32 v63, s30, 34
; GFX9-NEXT: v_readfirstlane_b32 s56, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s16
; GFX9-NEXT: v_readfirstlane_b32 s57, v4
@@ -154833,7 +154833,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s23, v16
; GFX9-NEXT: v_mov_b32_e32 v16, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; GFX9-NEXT: v_writelane_b32 v63, s99, 35
+; GFX9-NEXT: v_writelane_b32 v63, s31, 35
; GFX9-NEXT: v_readfirstlane_b32 s20, v17
; GFX9-NEXT: v_readfirstlane_b32 s21, v18
; GFX9-NEXT: v_readfirstlane_b32 s18, v3
@@ -156241,42 +156241,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_perm_b32 v34, v56, v34, s4
; GFX9-NEXT: v_perm_b32 v28, v29, v28, s4
; GFX9-NEXT: v_perm_b32 v22, v22, v32, s4
-; GFX9-NEXT: v_readlane_b32 s99, v63, 35
-; GFX9-NEXT: v_readlane_b32 s98, v63, 34
-; GFX9-NEXT: v_readlane_b32 s97, v63, 33
-; GFX9-NEXT: v_readlane_b32 s96, v63, 32
-; GFX9-NEXT: v_readlane_b32 s87, v63, 31
-; GFX9-NEXT: v_readlane_b32 s86, v63, 30
-; GFX9-NEXT: v_readlane_b32 s85, v63, 29
-; GFX9-NEXT: v_readlane_b32 s84, v63, 28
-; GFX9-NEXT: v_readlane_b32 s83, v63, 27
-; GFX9-NEXT: v_readlane_b32 s82, v63, 26
-; GFX9-NEXT: v_readlane_b32 s81, v63, 25
-; GFX9-NEXT: v_readlane_b32 s80, v63, 24
-; GFX9-NEXT: v_readlane_b32 s71, v63, 23
-; GFX9-NEXT: v_readlane_b32 s70, v63, 22
-; GFX9-NEXT: v_readlane_b32 s69, v63, 21
-; GFX9-NEXT: v_readlane_b32 s68, v63, 20
-; GFX9-NEXT: v_readlane_b32 s67, v63, 19
-; GFX9-NEXT: v_readlane_b32 s66, v63, 18
-; GFX9-NEXT: v_readlane_b32 s65, v63, 17
-; GFX9-NEXT: v_readlane_b32 s64, v63, 16
-; GFX9-NEXT: v_readlane_b32 s55, v63, 15
-; GFX9-NEXT: v_readlane_b32 s54, v63, 14
-; GFX9-NEXT: v_readlane_b32 s53, v63, 13
-; GFX9-NEXT: v_readlane_b32 s52, v63, 12
-; GFX9-NEXT: v_readlane_b32 s51, v63, 11
-; GFX9-NEXT: v_readlane_b32 s50, v63, 10
-; GFX9-NEXT: v_readlane_b32 s49, v63, 9
-; GFX9-NEXT: v_readlane_b32 s48, v63, 8
-; GFX9-NEXT: v_readlane_b32 s39, v63, 7
-; GFX9-NEXT: v_readlane_b32 s38, v63, 6
-; GFX9-NEXT: v_readlane_b32 s37, v63, 5
-; GFX9-NEXT: v_readlane_b32 s36, v63, 4
-; GFX9-NEXT: v_readlane_b32 s35, v63, 3
-; GFX9-NEXT: v_readlane_b32 s34, v63, 2
-; GFX9-NEXT: v_readlane_b32 s31, v63, 1
-; GFX9-NEXT: v_readlane_b32 s30, v63, 0
+; GFX9-NEXT: v_readlane_b32 s30, v63, 34
+; GFX9-NEXT: v_readlane_b32 s31, v63, 35
+; GFX9-NEXT: v_readlane_b32 s99, v63, 33
+; GFX9-NEXT: v_readlane_b32 s98, v63, 32
+; GFX9-NEXT: v_readlane_b32 s97, v63, 31
+; GFX9-NEXT: v_readlane_b32 s96, v63, 30
+; GFX9-NEXT: v_readlane_b32 s87, v63, 29
+; GFX9-NEXT: v_readlane_b32 s86, v63, 28
+; GFX9-NEXT: v_readlane_b32 s85, v63, 27
+; GFX9-NEXT: v_readlane_b32 s84, v63, 26
+; GFX9-NEXT: v_readlane_b32 s83, v63, 25
+; GFX9-NEXT: v_readlane_b32 s82, v63, 24
+; GFX9-NEXT: v_readlane_b32 s81, v63, 23
+; GFX9-NEXT: v_readlane_b32 s80, v63, 22
+; GFX9-NEXT: v_readlane_b32 s71, v63, 21
+; GFX9-NEXT: v_readlane_b32 s70, v63, 20
+; GFX9-NEXT: v_readlane_b32 s69, v63, 19
+; GFX9-NEXT: v_readlane_b32 s68, v63, 18
+; GFX9-NEXT: v_readlane_b32 s67, v63, 17
+; GFX9-NEXT: v_readlane_b32 s66, v63, 16
+; GFX9-NEXT: v_readlane_b32 s65, v63, 15
+; GFX9-NEXT: v_readlane_b32 s64, v63, 14
+; GFX9-NEXT: v_readlane_b32 s55, v63, 13
+; GFX9-NEXT: v_readlane_b32 s54, v63, 12
+; GFX9-NEXT: v_readlane_b32 s53, v63, 11
+; GFX9-NEXT: v_readlane_b32 s52, v63, 10
+; GFX9-NEXT: v_readlane_b32 s51, v63, 9
+; GFX9-NEXT: v_readlane_b32 s50, v63, 8
+; GFX9-NEXT: v_readlane_b32 s49, v63, 7
+; GFX9-NEXT: v_readlane_b32 s48, v63, 6
+; GFX9-NEXT: v_readlane_b32 s39, v63, 5
+; GFX9-NEXT: v_readlane_b32 s38, v63, 4
+; GFX9-NEXT: v_readlane_b32 s37, v63, 3
+; GFX9-NEXT: v_readlane_b32 s36, v63, 2
+; GFX9-NEXT: v_readlane_b32 s35, v63, 1
+; GFX9-NEXT: v_readlane_b32 s34, v63, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v25, v25, v30, s4
; GFX9-NEXT: v_or_b32_e32 v25, v25, v26
@@ -156595,43 +156595,40 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s96, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s98, 0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s31, 1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s97, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s99, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v16
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v17
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s28, v1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s34, 2
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s98, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s100, 2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s29, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s35, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s99, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s101, 3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s36, 4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s100, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s102, 4
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s37, 5
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s101, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s39, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s103, 5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s38, 6
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s102, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s48, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s104, 6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-TRUE16-NEXT: s_mov_b32 s97, 0
; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s39, 7
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s103, 7
; GFX11-TRUE16-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
@@ -156651,33 +156648,36 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s49, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s48, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s104, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s49, 9
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s50, 10
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s51, 11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s52, 12
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s53, 13
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s54, 14
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s55, 15
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s64, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s65, 17
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s66, 18
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s67, 19
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s68, 20
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s69, 21
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s70, 22
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s71, 23
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s80, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s81, 25
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s82, 26
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s83, 27
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s84, 28
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s85, 29
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s86, 30
-; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s87, 31
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s50, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s51, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s52, 10
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s53, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s54, 12
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s55, 13
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s64, 14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s65, 15
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s66, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s67, 17
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s68, 18
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s69, 19
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s70, 20
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s71, 21
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s80, 22
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s81, 23
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s82, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s83, 25
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s84, 26
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s85, 27
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s86, 28
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s87, 29
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s96, 30
+; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s97, 31
+; GFX11-TRUE16-NEXT: s_mov_b32 s97, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB91_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24
@@ -157998,47 +157998,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:68
-; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v75, 8
-; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v75, 7
-; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v75, 6
-; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v75, 5
-; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v75, 4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v75, 3
-; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v75, 2
-; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v75, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v75, 0
-; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v74, 31
-; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v74, 30
-; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v74, 29
-; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v74, 28
-; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v74, 27
-; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v74, 26
-; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v74, 25
-; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v74, 24
-; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v74, 23
-; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v74, 22
-; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v74, 21
-; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v74, 20
-; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v74, 19
-; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v74, 18
-; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v74, 17
-; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v74, 16
-; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v74, 15
-; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v74, 14
-; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v74, 13
-; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v74, 12
-; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v74, 11
-; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v74, 10
-; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v74, 9
-; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v74, 8
-; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v74, 7
-; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v74, 6
-; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v74, 5
-; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v74, 4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v74, 3
-; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v74, 2
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v74, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v74, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v75, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v75, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v75, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v75, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v75, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v75, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v75, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v75, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v75, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v74, 31
+; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v74, 30
+; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v74, 29
+; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v74, 28
+; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v74, 27
+; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v74, 26
+; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v74, 25
+; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v74, 24
+; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v74, 23
+; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v74, 22
+; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v74, 21
+; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v74, 20
+; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v74, 19
+; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v74, 18
+; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v74, 17
+; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v74, 16
+; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v74, 15
+; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v74, 14
+; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v74, 13
+; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v74, 12
+; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v74, 11
+; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v74, 10
+; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v74, 9
+; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v74, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v74, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v74, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v74, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v74, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v74, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v74, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v74, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v74, 0
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:72
@@ -158059,43 +158059,40 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s96, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s98, 0
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s31, 1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s97, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s99, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v16
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v17
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s34, 2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s98, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s100, 2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s29, v2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s35, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s99, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s101, 3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s36, 4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s100, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s102, 4
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s37, 5
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s101, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s39, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s103, 5
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s38, 6
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s102, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s48, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s104, 6
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-FAKE16-NEXT: s_mov_b32 s97, 0
; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s39, 7
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s103, 7
; GFX11-FAKE16-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
@@ -158115,33 +158112,36 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s49, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s48, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s104, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s49, 9
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s50, 10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s51, 11
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s52, 12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s53, 13
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s54, 14
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s55, 15
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s64, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s65, 17
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s66, 18
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s67, 19
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s68, 20
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s69, 21
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s70, 22
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s71, 23
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s80, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s81, 25
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s82, 26
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s83, 27
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s84, 28
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s85, 29
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s86, 30
-; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s87, 31
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s50, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s51, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s52, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s53, 11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s54, 12
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s55, 13
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s64, 14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s65, 15
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s66, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s67, 17
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s68, 18
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s69, 19
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s70, 20
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s71, 21
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s80, 22
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s81, 23
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s82, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s83, 25
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s84, 26
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s85, 27
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s86, 28
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s87, 29
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s96, 30
+; GFX11-FAKE16-NEXT: v_writelane_b32 v74, s97, 31
+; GFX11-FAKE16-NEXT: s_mov_b32 s97, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB91_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24
@@ -159466,47 +159466,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:60
; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:64
; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:68
-; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v75, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v75, 7
-; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v75, 6
-; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v75, 5
-; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v75, 4
-; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v75, 3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v75, 2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v75, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v75, 0
-; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v74, 31
-; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v74, 30
-; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v74, 29
-; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v74, 28
-; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v74, 27
-; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v74, 26
-; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v74, 25
-; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v74, 24
-; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v74, 23
-; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v74, 22
-; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v74, 21
-; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v74, 20
-; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v74, 19
-; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v74, 18
-; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v74, 17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v74, 16
-; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v74, 15
-; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v74, 14
-; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v74, 13
-; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v74, 12
-; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v74, 11
-; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v74, 10
-; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v74, 9
-; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v74, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v74, 7
-; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v74, 6
-; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v74, 5
-; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v74, 4
-; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v74, 3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v74, 2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v74, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v74, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v75, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v75, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v75, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v75, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v75, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v75, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v75, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v75, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v75, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v74, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v74, 30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v74, 29
+; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v74, 28
+; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v74, 27
+; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v74, 26
+; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v74, 25
+; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v74, 24
+; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v74, 23
+; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v74, 22
+; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v74, 21
+; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v74, 20
+; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v74, 19
+; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v74, 18
+; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v74, 17
+; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v74, 16
+; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v74, 15
+; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v74, 14
+; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v74, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v74, 12
+; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v74, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v74, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v74, 9
+; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v74, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v74, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v74, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v74, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v74, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v74, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v74, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v74, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v74, 0
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:72
@@ -165794,7 +165794,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:300
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v41, s30, 0
+; SI-NEXT: v_writelane_b32 v41, s34, 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s29, 0
; SI-NEXT: v_writelane_b32 v43, s28, 1
@@ -165809,41 +165809,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_writelane_b32 v43, s19, 10
; SI-NEXT: v_writelane_b32 v43, s18, 11
; SI-NEXT: v_writelane_b32 v43, s17, 12
-; SI-NEXT: v_writelane_b32 v41, s31, 1
-; SI-NEXT: v_writelane_b32 v41, s34, 2
-; SI-NEXT: v_writelane_b32 v41, s35, 3
-; SI-NEXT: v_writelane_b32 v41, s36, 4
-; SI-NEXT: v_writelane_b32 v41, s37, 5
-; SI-NEXT: v_writelane_b32 v41, s38, 6
-; SI-NEXT: v_writelane_b32 v41, s39, 7
-; SI-NEXT: v_writelane_b32 v41, s48, 8
-; SI-NEXT: v_writelane_b32 v41, s49, 9
-; SI-NEXT: v_writelane_b32 v41, s50, 10
-; SI-NEXT: v_writelane_b32 v41, s51, 11
-; SI-NEXT: v_writelane_b32 v41, s52, 12
-; SI-NEXT: v_writelane_b32 v41, s53, 13
-; SI-NEXT: v_writelane_b32 v41, s54, 14
-; SI-NEXT: v_writelane_b32 v41, s55, 15
-; SI-NEXT: v_writelane_b32 v41, s64, 16
-; SI-NEXT: v_writelane_b32 v41, s65, 17
-; SI-NEXT: v_writelane_b32 v41, s66, 18
-; SI-NEXT: v_writelane_b32 v41, s67, 19
-; SI-NEXT: v_writelane_b32 v41, s68, 20
-; SI-NEXT: v_writelane_b32 v41, s69, 21
-; SI-NEXT: v_writelane_b32 v41, s70, 22
-; SI-NEXT: v_writelane_b32 v41, s71, 23
-; SI-NEXT: v_writelane_b32 v41, s80, 24
-; SI-NEXT: v_writelane_b32 v41, s81, 25
-; SI-NEXT: v_writelane_b32 v41, s82, 26
-; SI-NEXT: v_writelane_b32 v41, s83, 27
-; SI-NEXT: v_writelane_b32 v41, s84, 28
-; SI-NEXT: v_writelane_b32 v41, s85, 29
-; SI-NEXT: v_writelane_b32 v41, s86, 30
-; SI-NEXT: v_writelane_b32 v41, s87, 31
-; SI-NEXT: v_writelane_b32 v41, s96, 32
-; SI-NEXT: v_writelane_b32 v41, s97, 33
-; SI-NEXT: v_writelane_b32 v41, s98, 34
+; SI-NEXT: v_writelane_b32 v41, s35, 1
+; SI-NEXT: v_writelane_b32 v41, s36, 2
+; SI-NEXT: v_writelane_b32 v41, s37, 3
+; SI-NEXT: v_writelane_b32 v41, s38, 4
+; SI-NEXT: v_writelane_b32 v41, s39, 5
+; SI-NEXT: v_writelane_b32 v41, s48, 6
+; SI-NEXT: v_writelane_b32 v41, s49, 7
+; SI-NEXT: v_writelane_b32 v41, s50, 8
+; SI-NEXT: v_writelane_b32 v41, s51, 9
+; SI-NEXT: v_writelane_b32 v41, s52, 10
+; SI-NEXT: v_writelane_b32 v41, s53, 11
+; SI-NEXT: v_writelane_b32 v41, s54, 12
+; SI-NEXT: v_writelane_b32 v41, s55, 13
+; SI-NEXT: v_writelane_b32 v41, s64, 14
+; SI-NEXT: v_writelane_b32 v41, s65, 15
+; SI-NEXT: v_writelane_b32 v41, s66, 16
+; SI-NEXT: v_writelane_b32 v41, s67, 17
+; SI-NEXT: v_writelane_b32 v41, s68, 18
+; SI-NEXT: v_writelane_b32 v41, s69, 19
+; SI-NEXT: v_writelane_b32 v41, s70, 20
+; SI-NEXT: v_writelane_b32 v41, s71, 21
+; SI-NEXT: v_writelane_b32 v41, s80, 22
+; SI-NEXT: v_writelane_b32 v41, s81, 23
+; SI-NEXT: v_writelane_b32 v41, s82, 24
+; SI-NEXT: v_writelane_b32 v41, s83, 25
+; SI-NEXT: v_writelane_b32 v41, s84, 26
+; SI-NEXT: v_writelane_b32 v41, s85, 27
+; SI-NEXT: v_writelane_b32 v41, s86, 28
+; SI-NEXT: v_writelane_b32 v41, s87, 29
+; SI-NEXT: v_writelane_b32 v41, s96, 30
+; SI-NEXT: v_writelane_b32 v41, s97, 31
+; SI-NEXT: v_writelane_b32 v41, s98, 32
; SI-NEXT: s_mov_b32 s22, s16
+; SI-NEXT: v_writelane_b32 v41, s99, 33
+; SI-NEXT: v_writelane_b32 v41, s30, 34
+; SI-NEXT: v_writelane_b32 v41, s31, 35
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
@@ -165870,7 +165871,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s60, v27
; SI-NEXT: v_readfirstlane_b32 s61, v26
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296
@@ -165900,6 +165900,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:256
; SI-NEXT: v_readfirstlane_b32 s6, v37
; SI-NEXT: v_readfirstlane_b32 s7, v38
+; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: v_readfirstlane_b32 s14, v8
; SI-NEXT: v_readfirstlane_b32 s15, v9
; SI-NEXT: v_readfirstlane_b32 s40, v7
@@ -165913,7 +165914,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s78, v15
; SI-NEXT: v_readfirstlane_b32 s38, v13
; SI-NEXT: v_readfirstlane_b32 s39, v24
-; SI-NEXT: v_writelane_b32 v41, s99, 35
; SI-NEXT: v_readfirstlane_b32 s48, v25
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s99, v54
@@ -167285,6 +167285,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_lshl_b32 s17, s70, 16
; SI-NEXT: s_or_b32 s4, s4, s47
; SI-NEXT: s_or_b32 s5, s5, s17
+; SI-NEXT: v_readlane_b32 s30, v41, 34
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s18
; SI-NEXT: v_mov_b32_e32 v2, s19
@@ -167317,42 +167318,41 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v29, s7
; SI-NEXT: v_mov_b32_e32 v30, s4
; SI-NEXT: v_mov_b32_e32 v31, s5
-; SI-NEXT: v_readlane_b32 s99, v41, 35
-; SI-NEXT: v_readlane_b32 s98, v41, 34
-; SI-NEXT: v_readlane_b32 s97, v41, 33
-; SI-NEXT: v_readlane_b32 s96, v41, 32
-; SI-NEXT: v_readlane_b32 s87, v41, 31
-; SI-NEXT: v_readlane_b32 s86, v41, 30
-; SI-NEXT: v_readlane_b32 s85, v41, 29
-; SI-NEXT: v_readlane_b32 s84, v41, 28
-; SI-NEXT: v_readlane_b32 s83, v41, 27
-; SI-NEXT: v_readlane_b32 s82, v41, 26
-; SI-NEXT: v_readlane_b32 s81, v41, 25
-; SI-NEXT: v_readlane_b32 s80, v41, 24
-; SI-NEXT: v_readlane_b32 s71, v41, 23
-; SI-NEXT: v_readlane_b32 s70, v41, 22
-; SI-NEXT: v_readlane_b32 s69, v41, 21
-; SI-NEXT: v_readlane_b32 s68, v41, 20
-; SI-NEXT: v_readlane_b32 s67, v41, 19
-; SI-NEXT: v_readlane_b32 s66, v41, 18
-; SI-NEXT: v_readlane_b32 s65, v41, 17
-; SI-NEXT: v_readlane_b32 s64, v41, 16
-; SI-NEXT: v_readlane_b32 s55, v41, 15
-; SI-NEXT: v_readlane_b32 s54, v41, 14
-; SI-NEXT: v_readlane_b32 s53, v41, 13
-; SI-NEXT: v_readlane_b32 s52, v41, 12
-; SI-NEXT: v_readlane_b32 s51, v41, 11
-; SI-NEXT: v_readlane_b32 s50, v41, 10
-; SI-NEXT: v_readlane_b32 s49, v41, 9
-; SI-NEXT: v_readlane_b32 s48, v41, 8
-; SI-NEXT: v_readlane_b32 s39, v41, 7
-; SI-NEXT: v_readlane_b32 s38, v41, 6
-; SI-NEXT: v_readlane_b32 s37, v41, 5
-; SI-NEXT: v_readlane_b32 s36, v41, 4
-; SI-NEXT: v_readlane_b32 s35, v41, 3
-; SI-NEXT: v_readlane_b32 s34, v41, 2
-; SI-NEXT: v_readlane_b32 s31, v41, 1
-; SI-NEXT: v_readlane_b32 s30, v41, 0
+; SI-NEXT: v_readlane_b32 s31, v41, 35
+; SI-NEXT: v_readlane_b32 s99, v41, 33
+; SI-NEXT: v_readlane_b32 s98, v41, 32
+; SI-NEXT: v_readlane_b32 s97, v41, 31
+; SI-NEXT: v_readlane_b32 s96, v41, 30
+; SI-NEXT: v_readlane_b32 s87, v41, 29
+; SI-NEXT: v_readlane_b32 s86, v41, 28
+; SI-NEXT: v_readlane_b32 s85, v41, 27
+; SI-NEXT: v_readlane_b32 s84, v41, 26
+; SI-NEXT: v_readlane_b32 s83, v41, 25
+; SI-NEXT: v_readlane_b32 s82, v41, 24
+; SI-NEXT: v_readlane_b32 s81, v41, 23
+; SI-NEXT: v_readlane_b32 s80, v41, 22
+; SI-NEXT: v_readlane_b32 s71, v41, 21
+; SI-NEXT: v_readlane_b32 s70, v41, 20
+; SI-NEXT: v_readlane_b32 s69, v41, 19
+; SI-NEXT: v_readlane_b32 s68, v41, 18
+; SI-NEXT: v_readlane_b32 s67, v41, 17
+; SI-NEXT: v_readlane_b32 s66, v41, 16
+; SI-NEXT: v_readlane_b32 s65, v41, 15
+; SI-NEXT: v_readlane_b32 s64, v41, 14
+; SI-NEXT: v_readlane_b32 s55, v41, 13
+; SI-NEXT: v_readlane_b32 s54, v41, 12
+; SI-NEXT: v_readlane_b32 s53, v41, 11
+; SI-NEXT: v_readlane_b32 s52, v41, 10
+; SI-NEXT: v_readlane_b32 s51, v41, 9
+; SI-NEXT: v_readlane_b32 s50, v41, 8
+; SI-NEXT: v_readlane_b32 s49, v41, 7
+; SI-NEXT: v_readlane_b32 s48, v41, 6
+; SI-NEXT: v_readlane_b32 s39, v41, 5
+; SI-NEXT: v_readlane_b32 s38, v41, 4
+; SI-NEXT: v_readlane_b32 s37, v41, 3
+; SI-NEXT: v_readlane_b32 s36, v41, 2
+; SI-NEXT: v_readlane_b32 s35, v41, 1
+; SI-NEXT: v_readlane_b32 s34, v41, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
@@ -175472,74 +175472,72 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_writelane_b32 v34, s30, 0
-; SI-NEXT: v_writelane_b32 v34, s31, 1
-; SI-NEXT: v_writelane_b32 v34, s34, 2
-; SI-NEXT: v_writelane_b32 v34, s35, 3
-; SI-NEXT: v_writelane_b32 v34, s36, 4
-; SI-NEXT: v_writelane_b32 v34, s37, 5
-; SI-NEXT: v_writelane_b32 v34, s38, 6
-; SI-NEXT: v_writelane_b32 v34, s39, 7
-; SI-NEXT: v_writelane_b32 v34, s48, 8
-; SI-NEXT: v_writelane_b32 v34, s49, 9
-; SI-NEXT: v_writelane_b32 v34, s50, 10
-; SI-NEXT: v_writelane_b32 v34, s51, 11
-; SI-NEXT: v_writelane_b32 v34, s52, 12
-; SI-NEXT: v_writelane_b32 v34, s53, 13
-; SI-NEXT: v_writelane_b32 v34, s54, 14
-; SI-NEXT: v_writelane_b32 v34, s55, 15
-; SI-NEXT: v_writelane_b32 v34, s64, 16
-; SI-NEXT: v_writelane_b32 v34, s65, 17
-; SI-NEXT: v_writelane_b32 v34, s66, 18
-; SI-NEXT: v_writelane_b32 v34, s67, 19
-; SI-NEXT: v_writelane_b32 v34, s68, 20
-; SI-NEXT: v_writelane_b32 v34, s69, 21
-; SI-NEXT: v_writelane_b32 v34, s70, 22
+; SI-NEXT: v_writelane_b32 v34, s34, 0
+; SI-NEXT: v_writelane_b32 v34, s35, 1
+; SI-NEXT: v_writelane_b32 v34, s36, 2
+; SI-NEXT: v_writelane_b32 v34, s37, 3
+; SI-NEXT: v_writelane_b32 v34, s38, 4
+; SI-NEXT: v_writelane_b32 v34, s39, 5
+; SI-NEXT: v_writelane_b32 v34, s48, 6
+; SI-NEXT: v_writelane_b32 v34, s49, 7
+; SI-NEXT: v_writelane_b32 v34, s50, 8
+; SI-NEXT: v_writelane_b32 v34, s51, 9
+; SI-NEXT: v_writelane_b32 v34, s52, 10
+; SI-NEXT: v_writelane_b32 v34, s53, 11
+; SI-NEXT: v_writelane_b32 v34, s54, 12
+; SI-NEXT: v_writelane_b32 v34, s55, 13
+; SI-NEXT: v_writelane_b32 v34, s64, 14
+; SI-NEXT: v_writelane_b32 v34, s65, 15
+; SI-NEXT: v_writelane_b32 v34, s66, 16
+; SI-NEXT: v_writelane_b32 v34, s67, 17
+; SI-NEXT: v_writelane_b32 v34, s68, 18
+; SI-NEXT: v_writelane_b32 v34, s69, 19
+; SI-NEXT: v_writelane_b32 v34, s70, 20
; SI-NEXT: s_lshr_b32 s6, s20, 16
; SI-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v34, s71, 23
+; SI-NEXT: v_writelane_b32 v34, s71, 21
; SI-NEXT: s_lshr_b32 s7, s22, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v37, s6, 0
-; SI-NEXT: v_writelane_b32 v34, s80, 24
+; SI-NEXT: v_writelane_b32 v34, s80, 22
; SI-NEXT: s_lshr_b32 s8, s24, 16
; SI-NEXT: v_writelane_b32 v37, s7, 2
-; SI-NEXT: v_writelane_b32 v34, s81, 25
+; SI-NEXT: v_writelane_b32 v34, s81, 23
; SI-NEXT: s_lshr_b32 s9, s26, 16
; SI-NEXT: v_writelane_b32 v37, s8, 4
-; SI-NEXT: v_writelane_b32 v34, s82, 26
+; SI-NEXT: v_writelane_b32 v34, s82, 24
; SI-NEXT: s_lshr_b32 s10, s28, 16
; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1
; SI-NEXT: v_writelane_b32 v37, s9, 6
-; SI-NEXT: v_writelane_b32 v34, s83, 27
+; SI-NEXT: v_writelane_b32 v34, s83, 25
; SI-NEXT: v_readfirstlane_b32 s11, v20
; SI-NEXT: v_writelane_b32 v37, s10, 8
-; SI-NEXT: v_writelane_b32 v34, s84, 28
+; SI-NEXT: v_writelane_b32 v34, s84, 26
; SI-NEXT: v_readfirstlane_b32 s12, v3
; SI-NEXT: v_writelane_b32 v37, s11, 10
-; SI-NEXT: v_writelane_b32 v34, s85, 29
+; SI-NEXT: v_writelane_b32 v34, s85, 27
; SI-NEXT: v_readfirstlane_b32 s13, v8
; SI-NEXT: v_writelane_b32 v37, s12, 11
-; SI-NEXT: v_writelane_b32 v34, s86, 30
+; SI-NEXT: v_writelane_b32 v34, s86, 28
; SI-NEXT: v_readfirstlane_b32 s14, v7
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
; SI-NEXT: v_readfirstlane_b32 s71, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT: v_writelane_b32 v37, s13, 12
-; SI-NEXT: v_writelane_b32 v34, s87, 31
+; SI-NEXT: v_writelane_b32 v34, s87, 29
; SI-NEXT: v_readfirstlane_b32 s15, v4
; SI-NEXT: v_writelane_b32 v37, s14, 13
-; SI-NEXT: v_writelane_b32 v34, s96, 32
+; SI-NEXT: v_writelane_b32 v34, s96, 30
; SI-NEXT: v_readfirstlane_b32 s93, v10
; SI-NEXT: v_writelane_b32 v37, s15, 14
-; SI-NEXT: v_writelane_b32 v34, s97, 33
+; SI-NEXT: v_writelane_b32 v34, s97, 31
; SI-NEXT: v_readfirstlane_b32 s34, v12
; SI-NEXT: v_writelane_b32 v37, s93, 15
-; SI-NEXT: v_writelane_b32 v34, s98, 34
+; SI-NEXT: v_writelane_b32 v34, s98, 32
; SI-NEXT: v_readfirstlane_b32 s38, v11
; SI-NEXT: v_writelane_b32 v37, s34, 16
-; SI-NEXT: v_writelane_b32 v34, s99, 35
+; SI-NEXT: v_writelane_b32 v34, s99, 33
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: s_lshr_b32 s90, s29, 16
; SI-NEXT: s_lshr_b32 s89, s27, 16
@@ -175555,6 +175553,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 vcc_lo, v7
; SI-NEXT: v_writelane_b32 v37, s18, 18
; SI-NEXT: v_writelane_b32 v37, vcc_lo, 19
+; SI-NEXT: v_writelane_b32 v34, s30, 34
; SI-NEXT: v_readfirstlane_b32 s94, v18
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; SI-NEXT: v_readfirstlane_b32 s64, v17
@@ -175579,6 +175578,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: v_writelane_b32 v37, s21, 20
+; SI-NEXT: v_writelane_b32 v34, s31, 35
; SI-NEXT: v_readfirstlane_b32 s35, v2
; SI-NEXT: v_readfirstlane_b32 s39, v1
; SI-NEXT: v_readfirstlane_b32 s95, v18
@@ -177124,48 +177124,48 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: v_readlane_b32 s30, v34, 34
; SI-NEXT: v_readlane_b32 s19, v36, 24
; SI-NEXT: v_readlane_b32 s17, v36, 30
; SI-NEXT: v_readlane_b32 s13, v36, 42
; SI-NEXT: v_readlane_b32 s11, v37, 7
; SI-NEXT: v_readlane_b32 s9, v37, 1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v34, 35
-; SI-NEXT: v_readlane_b32 s98, v34, 34
-; SI-NEXT: v_readlane_b32 s97, v34, 33
-; SI-NEXT: v_readlane_b32 s96, v34, 32
-; SI-NEXT: v_readlane_b32 s87, v34, 31
-; SI-NEXT: v_readlane_b32 s86, v34, 30
-; SI-NEXT: v_readlane_b32 s85, v34, 29
-; SI-NEXT: v_readlane_b32 s84, v34, 28
-; SI-NEXT: v_readlane_b32 s83, v34, 27
-; SI-NEXT: v_readlane_b32 s82, v34, 26
-; SI-NEXT: v_readlane_b32 s81, v34, 25
-; SI-NEXT: v_readlane_b32 s80, v34, 24
-; SI-NEXT: v_readlane_b32 s71, v34, 23
-; SI-NEXT: v_readlane_b32 s70, v34, 22
-; SI-NEXT: v_readlane_b32 s69, v34, 21
-; SI-NEXT: v_readlane_b32 s68, v34, 20
-; SI-NEXT: v_readlane_b32 s67, v34, 19
-; SI-NEXT: v_readlane_b32 s66, v34, 18
-; SI-NEXT: v_readlane_b32 s65, v34, 17
-; SI-NEXT: v_readlane_b32 s64, v34, 16
-; SI-NEXT: v_readlane_b32 s55, v34, 15
-; SI-NEXT: v_readlane_b32 s54, v34, 14
-; SI-NEXT: v_readlane_b32 s53, v34, 13
-; SI-NEXT: v_readlane_b32 s52, v34, 12
-; SI-NEXT: v_readlane_b32 s51, v34, 11
-; SI-NEXT: v_readlane_b32 s50, v34, 10
-; SI-NEXT: v_readlane_b32 s49, v34, 9
-; SI-NEXT: v_readlane_b32 s48, v34, 8
-; SI-NEXT: v_readlane_b32 s39, v34, 7
-; SI-NEXT: v_readlane_b32 s38, v34, 6
-; SI-NEXT: v_readlane_b32 s37, v34, 5
-; SI-NEXT: v_readlane_b32 s36, v34, 4
-; SI-NEXT: v_readlane_b32 s35, v34, 3
-; SI-NEXT: v_readlane_b32 s34, v34, 2
-; SI-NEXT: v_readlane_b32 s31, v34, 1
-; SI-NEXT: v_readlane_b32 s30, v34, 0
+; SI-NEXT: v_readlane_b32 s31, v34, 35
+; SI-NEXT: v_readlane_b32 s99, v34, 33
+; SI-NEXT: v_readlane_b32 s98, v34, 32
+; SI-NEXT: v_readlane_b32 s97, v34, 31
+; SI-NEXT: v_readlane_b32 s96, v34, 30
+; SI-NEXT: v_readlane_b32 s87, v34, 29
+; SI-NEXT: v_readlane_b32 s86, v34, 28
+; SI-NEXT: v_readlane_b32 s85, v34, 27
+; SI-NEXT: v_readlane_b32 s84, v34, 26
+; SI-NEXT: v_readlane_b32 s83, v34, 25
+; SI-NEXT: v_readlane_b32 s82, v34, 24
+; SI-NEXT: v_readlane_b32 s81, v34, 23
+; SI-NEXT: v_readlane_b32 s80, v34, 22
+; SI-NEXT: v_readlane_b32 s71, v34, 21
+; SI-NEXT: v_readlane_b32 s70, v34, 20
+; SI-NEXT: v_readlane_b32 s69, v34, 19
+; SI-NEXT: v_readlane_b32 s68, v34, 18
+; SI-NEXT: v_readlane_b32 s67, v34, 17
+; SI-NEXT: v_readlane_b32 s66, v34, 16
+; SI-NEXT: v_readlane_b32 s65, v34, 15
+; SI-NEXT: v_readlane_b32 s64, v34, 14
+; SI-NEXT: v_readlane_b32 s55, v34, 13
+; SI-NEXT: v_readlane_b32 s54, v34, 12
+; SI-NEXT: v_readlane_b32 s53, v34, 11
+; SI-NEXT: v_readlane_b32 s52, v34, 10
+; SI-NEXT: v_readlane_b32 s51, v34, 9
+; SI-NEXT: v_readlane_b32 s50, v34, 8
+; SI-NEXT: v_readlane_b32 s49, v34, 7
+; SI-NEXT: v_readlane_b32 s48, v34, 6
+; SI-NEXT: v_readlane_b32 s39, v34, 5
+; SI-NEXT: v_readlane_b32 s38, v34, 4
+; SI-NEXT: v_readlane_b32 s37, v34, 3
+; SI-NEXT: v_readlane_b32 s36, v34, 2
+; SI-NEXT: v_readlane_b32 s35, v34, 1
+; SI-NEXT: v_readlane_b32 s34, v34, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -177182,37 +177182,37 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v63, s30, 0
-; VI-NEXT: v_writelane_b32 v63, s31, 1
-; VI-NEXT: v_writelane_b32 v63, s34, 2
-; VI-NEXT: v_writelane_b32 v63, s35, 3
-; VI-NEXT: v_writelane_b32 v63, s36, 4
-; VI-NEXT: v_writelane_b32 v63, s37, 5
-; VI-NEXT: v_writelane_b32 v63, s38, 6
-; VI-NEXT: v_writelane_b32 v63, s39, 7
-; VI-NEXT: v_writelane_b32 v63, s48, 8
-; VI-NEXT: v_writelane_b32 v63, s49, 9
-; VI-NEXT: v_writelane_b32 v63, s50, 10
-; VI-NEXT: v_writelane_b32 v63, s51, 11
-; VI-NEXT: v_writelane_b32 v63, s52, 12
-; VI-NEXT: v_writelane_b32 v63, s53, 13
-; VI-NEXT: v_writelane_b32 v63, s54, 14
-; VI-NEXT: v_writelane_b32 v63, s55, 15
-; VI-NEXT: v_writelane_b32 v63, s64, 16
-; VI-NEXT: v_writelane_b32 v63, s65, 17
-; VI-NEXT: v_writelane_b32 v63, s66, 18
-; VI-NEXT: v_writelane_b32 v63, s67, 19
-; VI-NEXT: v_writelane_b32 v63, s68, 20
-; VI-NEXT: v_writelane_b32 v63, s69, 21
-; VI-NEXT: v_writelane_b32 v63, s70, 22
-; VI-NEXT: v_writelane_b32 v63, s71, 23
-; VI-NEXT: v_writelane_b32 v63, s80, 24
-; VI-NEXT: v_writelane_b32 v63, s81, 25
-; VI-NEXT: v_writelane_b32 v63, s82, 26
-; VI-NEXT: v_writelane_b32 v63, s83, 27
-; VI-NEXT: v_writelane_b32 v63, s84, 28
-; VI-NEXT: v_writelane_b32 v63, s85, 29
-; VI-NEXT: v_writelane_b32 v63, s86, 30
+; VI-NEXT: v_writelane_b32 v63, s34, 0
+; VI-NEXT: v_writelane_b32 v63, s35, 1
+; VI-NEXT: v_writelane_b32 v63, s36, 2
+; VI-NEXT: v_writelane_b32 v63, s37, 3
+; VI-NEXT: v_writelane_b32 v63, s38, 4
+; VI-NEXT: v_writelane_b32 v63, s39, 5
+; VI-NEXT: v_writelane_b32 v63, s48, 6
+; VI-NEXT: v_writelane_b32 v63, s49, 7
+; VI-NEXT: v_writelane_b32 v63, s50, 8
+; VI-NEXT: v_writelane_b32 v63, s51, 9
+; VI-NEXT: v_writelane_b32 v63, s52, 10
+; VI-NEXT: v_writelane_b32 v63, s53, 11
+; VI-NEXT: v_writelane_b32 v63, s54, 12
+; VI-NEXT: v_writelane_b32 v63, s55, 13
+; VI-NEXT: v_writelane_b32 v63, s64, 14
+; VI-NEXT: v_writelane_b32 v63, s65, 15
+; VI-NEXT: v_writelane_b32 v63, s66, 16
+; VI-NEXT: v_writelane_b32 v63, s67, 17
+; VI-NEXT: v_writelane_b32 v63, s68, 18
+; VI-NEXT: v_writelane_b32 v63, s69, 19
+; VI-NEXT: v_writelane_b32 v63, s70, 20
+; VI-NEXT: v_writelane_b32 v63, s71, 21
+; VI-NEXT: v_writelane_b32 v63, s80, 22
+; VI-NEXT: v_writelane_b32 v63, s81, 23
+; VI-NEXT: v_writelane_b32 v63, s82, 24
+; VI-NEXT: v_writelane_b32 v63, s83, 25
+; VI-NEXT: v_writelane_b32 v63, s84, 26
+; VI-NEXT: v_writelane_b32 v63, s85, 27
+; VI-NEXT: v_writelane_b32 v63, s86, 28
+; VI-NEXT: v_writelane_b32 v63, s87, 29
+; VI-NEXT: v_writelane_b32 v63, s30, 30
; VI-NEXT: v_readfirstlane_b32 s56, v3
; VI-NEXT: v_mov_b32_e32 v3, s16
; VI-NEXT: v_readfirstlane_b32 s57, v4
@@ -177242,7 +177242,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s23, v16
; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT: v_writelane_b32 v63, s87, 31
+; VI-NEXT: v_writelane_b32 v63, s31, 31
; VI-NEXT: v_readfirstlane_b32 s20, v17
; VI-NEXT: v_readfirstlane_b32 s21, v18
; VI-NEXT: v_readfirstlane_b32 s18, v3
@@ -178163,38 +178163,38 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_perm_b32 v9, v47, v9, s4
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_readlane_b32 s87, v63, 31
-; VI-NEXT: v_readlane_b32 s86, v63, 30
-; VI-NEXT: v_readlane_b32 s85, v63, 29
-; VI-NEXT: v_readlane_b32 s84, v63, 28
-; VI-NEXT: v_readlane_b32 s83, v63, 27
-; VI-NEXT: v_readlane_b32 s82, v63, 26
-; VI-NEXT: v_readlane_b32 s81, v63, 25
-; VI-NEXT: v_readlane_b32 s80, v63, 24
-; VI-NEXT: v_readlane_b32 s71, v63, 23
-; VI-NEXT: v_readlane_b32 s70, v63, 22
-; VI-NEXT: v_readlane_b32 s69, v63, 21
-; VI-NEXT: v_readlane_b32 s68, v63, 20
-; VI-NEXT: v_readlane_b32 s67, v63, 19
-; VI-NEXT: v_readlane_b32 s66, v63, 18
-; VI-NEXT: v_readlane_b32 s65, v63, 17
-; VI-NEXT: v_readlane_b32 s64, v63, 16
-; VI-NEXT: v_readlane_b32 s55, v63, 15
-; VI-NEXT: v_readlane_b32 s54, v63, 14
-; VI-NEXT: v_readlane_b32 s53, v63, 13
-; VI-NEXT: v_readlane_b32 s52, v63, 12
-; VI-NEXT: v_readlane_b32 s51, v63, 11
-; VI-NEXT: v_readlane_b32 s50, v63, 10
-; VI-NEXT: v_readlane_b32 s49, v63, 9
-; VI-NEXT: v_readlane_b32 s48, v63, 8
-; VI-NEXT: v_readlane_b32 s39, v63, 7
-; VI-NEXT: v_readlane_b32 s38, v63, 6
-; VI-NEXT: v_readlane_b32 s37, v63, 5
-; VI-NEXT: v_readlane_b32 s36, v63, 4
-; VI-NEXT: v_readlane_b32 s35, v63, 3
-; VI-NEXT: v_readlane_b32 s34, v63, 2
-; VI-NEXT: v_readlane_b32 s31, v63, 1
-; VI-NEXT: v_readlane_b32 s30, v63, 0
+; VI-NEXT: v_readlane_b32 s30, v63, 30
+; VI-NEXT: v_readlane_b32 s31, v63, 31
+; VI-NEXT: v_readlane_b32 s87, v63, 29
+; VI-NEXT: v_readlane_b32 s86, v63, 28
+; VI-NEXT: v_readlane_b32 s85, v63, 27
+; VI-NEXT: v_readlane_b32 s84, v63, 26
+; VI-NEXT: v_readlane_b32 s83, v63, 25
+; VI-NEXT: v_readlane_b32 s82, v63, 24
+; VI-NEXT: v_readlane_b32 s81, v63, 23
+; VI-NEXT: v_readlane_b32 s80, v63, 22
+; VI-NEXT: v_readlane_b32 s71, v63, 21
+; VI-NEXT: v_readlane_b32 s70, v63, 20
+; VI-NEXT: v_readlane_b32 s69, v63, 19
+; VI-NEXT: v_readlane_b32 s68, v63, 18
+; VI-NEXT: v_readlane_b32 s67, v63, 17
+; VI-NEXT: v_readlane_b32 s66, v63, 16
+; VI-NEXT: v_readlane_b32 s65, v63, 15
+; VI-NEXT: v_readlane_b32 s64, v63, 14
+; VI-NEXT: v_readlane_b32 s55, v63, 13
+; VI-NEXT: v_readlane_b32 s54, v63, 12
+; VI-NEXT: v_readlane_b32 s53, v63, 11
+; VI-NEXT: v_readlane_b32 s52, v63, 10
+; VI-NEXT: v_readlane_b32 s51, v63, 9
+; VI-NEXT: v_readlane_b32 s50, v63, 8
+; VI-NEXT: v_readlane_b32 s49, v63, 7
+; VI-NEXT: v_readlane_b32 s48, v63, 6
+; VI-NEXT: v_readlane_b32 s39, v63, 5
+; VI-NEXT: v_readlane_b32 s38, v63, 4
+; VI-NEXT: v_readlane_b32 s37, v63, 3
+; VI-NEXT: v_readlane_b32 s36, v63, 2
+; VI-NEXT: v_readlane_b32 s35, v63, 1
+; VI-NEXT: v_readlane_b32 s34, v63, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v56, v23, v56, s4
; VI-NEXT: v_or_b32_e32 v20, v56, v20
@@ -179429,43 +179429,43 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v74, s30, 0
-; GFX11-NEXT: v_writelane_b32 v75, s96, 0
+; GFX11-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-NEXT: v_writelane_b32 v75, s98, 0
; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT: v_writelane_b32 v74, s31, 1
-; GFX11-NEXT: v_writelane_b32 v75, s97, 1
+; GFX11-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-NEXT: v_writelane_b32 v75, s99, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s40, v16
; GFX11-NEXT: v_readfirstlane_b32 s41, v17
; GFX11-NEXT: v_readfirstlane_b32 s28, v1
-; GFX11-NEXT: v_writelane_b32 v74, s34, 2
-; GFX11-NEXT: v_writelane_b32 v75, s98, 2
+; GFX11-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-NEXT: v_writelane_b32 v75, s100, 2
; GFX11-NEXT: v_readfirstlane_b32 s29, v2
; GFX11-NEXT: v_readfirstlane_b32 s14, v3
; GFX11-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-NEXT: v_writelane_b32 v74, s35, 3
-; GFX11-NEXT: v_writelane_b32 v75, s99, 3
+; GFX11-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-NEXT: v_writelane_b32 v75, s101, 3
; GFX11-NEXT: v_readfirstlane_b32 s12, v5
; GFX11-NEXT: v_readfirstlane_b32 s13, v6
; GFX11-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-NEXT: v_writelane_b32 v74, s36, 4
-; GFX11-NEXT: v_writelane_b32 v75, s100, 4
+; GFX11-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-NEXT: v_writelane_b32 v75, s102, 4
; GFX11-NEXT: v_readfirstlane_b32 s11, v8
; GFX11-NEXT: v_readfirstlane_b32 s8, v9
; GFX11-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-NEXT: v_writelane_b32 v74, s37, 5
-; GFX11-NEXT: v_writelane_b32 v75, s101, 5
+; GFX11-NEXT: v_writelane_b32 v74, s39, 5
+; GFX11-NEXT: v_writelane_b32 v75, s103, 5
; GFX11-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-NEXT: v_readfirstlane_b32 s7, v12
; GFX11-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-NEXT: v_writelane_b32 v74, s38, 6
-; GFX11-NEXT: v_writelane_b32 v75, s102, 6
+; GFX11-NEXT: v_writelane_b32 v74, s48, 6
+; GFX11-NEXT: v_writelane_b32 v75, s104, 6
; GFX11-NEXT: v_readfirstlane_b32 s5, v14
; GFX11-NEXT: s_mov_b32 s99, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: v_writelane_b32 v74, s39, 7
-; GFX11-NEXT: v_writelane_b32 v75, s103, 7
+; GFX11-NEXT: v_writelane_b32 v74, s49, 7
+; GFX11-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64
@@ -179487,31 +179487,31 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v73, s32
; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-NEXT: v_writelane_b32 v74, s48, 8
-; GFX11-NEXT: v_writelane_b32 v75, s104, 8
-; GFX11-NEXT: v_writelane_b32 v74, s49, 9
-; GFX11-NEXT: v_writelane_b32 v74, s50, 10
-; GFX11-NEXT: v_writelane_b32 v74, s51, 11
-; GFX11-NEXT: v_writelane_b32 v74, s52, 12
-; GFX11-NEXT: v_writelane_b32 v74, s53, 13
-; GFX11-NEXT: v_writelane_b32 v74, s54, 14
-; GFX11-NEXT: v_writelane_b32 v74, s55, 15
-; GFX11-NEXT: v_writelane_b32 v74, s64, 16
-; GFX11-NEXT: v_writelane_b32 v74, s65, 17
-; GFX11-NEXT: v_writelane_b32 v74, s66, 18
-; GFX11-NEXT: v_writelane_b32 v74, s67, 19
-; GFX11-NEXT: v_writelane_b32 v74, s68, 20
-; GFX11-NEXT: v_writelane_b32 v74, s69, 21
-; GFX11-NEXT: v_writelane_b32 v74, s70, 22
-; GFX11-NEXT: v_writelane_b32 v74, s71, 23
-; GFX11-NEXT: v_writelane_b32 v74, s80, 24
-; GFX11-NEXT: v_writelane_b32 v74, s81, 25
-; GFX11-NEXT: v_writelane_b32 v74, s82, 26
-; GFX11-NEXT: v_writelane_b32 v74, s83, 27
-; GFX11-NEXT: v_writelane_b32 v74, s84, 28
-; GFX11-NEXT: v_writelane_b32 v74, s85, 29
-; GFX11-NEXT: v_writelane_b32 v74, s86, 30
-; GFX11-NEXT: v_writelane_b32 v74, s87, 31
+; GFX11-NEXT: v_writelane_b32 v74, s50, 8
+; GFX11-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-NEXT: v_writelane_b32 v74, s51, 9
+; GFX11-NEXT: v_writelane_b32 v74, s52, 10
+; GFX11-NEXT: v_writelane_b32 v74, s53, 11
+; GFX11-NEXT: v_writelane_b32 v74, s54, 12
+; GFX11-NEXT: v_writelane_b32 v74, s55, 13
+; GFX11-NEXT: v_writelane_b32 v74, s64, 14
+; GFX11-NEXT: v_writelane_b32 v74, s65, 15
+; GFX11-NEXT: v_writelane_b32 v74, s66, 16
+; GFX11-NEXT: v_writelane_b32 v74, s67, 17
+; GFX11-NEXT: v_writelane_b32 v74, s68, 18
+; GFX11-NEXT: v_writelane_b32 v74, s69, 19
+; GFX11-NEXT: v_writelane_b32 v74, s70, 20
+; GFX11-NEXT: v_writelane_b32 v74, s71, 21
+; GFX11-NEXT: v_writelane_b32 v74, s80, 22
+; GFX11-NEXT: v_writelane_b32 v74, s81, 23
+; GFX11-NEXT: v_writelane_b32 v74, s82, 24
+; GFX11-NEXT: v_writelane_b32 v74, s83, 25
+; GFX11-NEXT: v_writelane_b32 v74, s84, 26
+; GFX11-NEXT: v_writelane_b32 v74, s85, 27
+; GFX11-NEXT: v_writelane_b32 v74, s86, 28
+; GFX11-NEXT: v_writelane_b32 v74, s87, 29
+; GFX11-NEXT: v_writelane_b32 v74, s96, 30
+; GFX11-NEXT: v_writelane_b32 v74, s97, 31
; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 16
@@ -180239,47 +180239,47 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT: v_readlane_b32 s104, v75, 8
-; GFX11-NEXT: v_readlane_b32 s103, v75, 7
-; GFX11-NEXT: v_readlane_b32 s102, v75, 6
-; GFX11-NEXT: v_readlane_b32 s101, v75, 5
-; GFX11-NEXT: v_readlane_b32 s100, v75, 4
-; GFX11-NEXT: v_readlane_b32 s99, v75, 3
-; GFX11-NEXT: v_readlane_b32 s98, v75, 2
-; GFX11-NEXT: v_readlane_b32 s97, v75, 1
-; GFX11-NEXT: v_readlane_b32 s96, v75, 0
-; GFX11-NEXT: v_readlane_b32 s87, v74, 31
-; GFX11-NEXT: v_readlane_b32 s86, v74, 30
-; GFX11-NEXT: v_readlane_b32 s85, v74, 29
-; GFX11-NEXT: v_readlane_b32 s84, v74, 28
-; GFX11-NEXT: v_readlane_b32 s83, v74, 27
-; GFX11-NEXT: v_readlane_b32 s82, v74, 26
-; GFX11-NEXT: v_readlane_b32 s81, v74, 25
-; GFX11-NEXT: v_readlane_b32 s80, v74, 24
-; GFX11-NEXT: v_readlane_b32 s71, v74, 23
-; GFX11-NEXT: v_readlane_b32 s70, v74, 22
-; GFX11-NEXT: v_readlane_b32 s69, v74, 21
-; GFX11-NEXT: v_readlane_b32 s68, v74, 20
-; GFX11-NEXT: v_readlane_b32 s67, v74, 19
-; GFX11-NEXT: v_readlane_b32 s66, v74, 18
-; GFX11-NEXT: v_readlane_b32 s65, v74, 17
-; GFX11-NEXT: v_readlane_b32 s64, v74, 16
-; GFX11-NEXT: v_readlane_b32 s55, v74, 15
-; GFX11-NEXT: v_readlane_b32 s54, v74, 14
-; GFX11-NEXT: v_readlane_b32 s53, v74, 13
-; GFX11-NEXT: v_readlane_b32 s52, v74, 12
-; GFX11-NEXT: v_readlane_b32 s51, v74, 11
-; GFX11-NEXT: v_readlane_b32 s50, v74, 10
-; GFX11-NEXT: v_readlane_b32 s49, v74, 9
-; GFX11-NEXT: v_readlane_b32 s48, v74, 8
-; GFX11-NEXT: v_readlane_b32 s39, v74, 7
-; GFX11-NEXT: v_readlane_b32 s38, v74, 6
-; GFX11-NEXT: v_readlane_b32 s37, v74, 5
-; GFX11-NEXT: v_readlane_b32 s36, v74, 4
-; GFX11-NEXT: v_readlane_b32 s35, v74, 3
-; GFX11-NEXT: v_readlane_b32 s34, v74, 2
-; GFX11-NEXT: v_readlane_b32 s31, v74, 1
-; GFX11-NEXT: v_readlane_b32 s30, v74, 0
+; GFX11-NEXT: v_readlane_b32 s30, v75, 7
+; GFX11-NEXT: v_readlane_b32 s31, v75, 8
+; GFX11-NEXT: v_readlane_b32 s104, v75, 6
+; GFX11-NEXT: v_readlane_b32 s103, v75, 5
+; GFX11-NEXT: v_readlane_b32 s102, v75, 4
+; GFX11-NEXT: v_readlane_b32 s101, v75, 3
+; GFX11-NEXT: v_readlane_b32 s100, v75, 2
+; GFX11-NEXT: v_readlane_b32 s99, v75, 1
+; GFX11-NEXT: v_readlane_b32 s98, v75, 0
+; GFX11-NEXT: v_readlane_b32 s97, v74, 31
+; GFX11-NEXT: v_readlane_b32 s96, v74, 30
+; GFX11-NEXT: v_readlane_b32 s87, v74, 29
+; GFX11-NEXT: v_readlane_b32 s86, v74, 28
+; GFX11-NEXT: v_readlane_b32 s85, v74, 27
+; GFX11-NEXT: v_readlane_b32 s84, v74, 26
+; GFX11-NEXT: v_readlane_b32 s83, v74, 25
+; GFX11-NEXT: v_readlane_b32 s82, v74, 24
+; GFX11-NEXT: v_readlane_b32 s81, v74, 23
+; GFX11-NEXT: v_readlane_b32 s80, v74, 22
+; GFX11-NEXT: v_readlane_b32 s71, v74, 21
+; GFX11-NEXT: v_readlane_b32 s70, v74, 20
+; GFX11-NEXT: v_readlane_b32 s69, v74, 19
+; GFX11-NEXT: v_readlane_b32 s68, v74, 18
+; GFX11-NEXT: v_readlane_b32 s67, v74, 17
+; GFX11-NEXT: v_readlane_b32 s66, v74, 16
+; GFX11-NEXT: v_readlane_b32 s65, v74, 15
+; GFX11-NEXT: v_readlane_b32 s64, v74, 14
+; GFX11-NEXT: v_readlane_b32 s55, v74, 13
+; GFX11-NEXT: v_readlane_b32 s54, v74, 12
+; GFX11-NEXT: v_readlane_b32 s53, v74, 11
+; GFX11-NEXT: v_readlane_b32 s52, v74, 10
+; GFX11-NEXT: v_readlane_b32 s51, v74, 9
+; GFX11-NEXT: v_readlane_b32 s50, v74, 8
+; GFX11-NEXT: v_readlane_b32 s49, v74, 7
+; GFX11-NEXT: v_readlane_b32 s48, v74, 6
+; GFX11-NEXT: v_readlane_b32 s39, v74, 5
+; GFX11-NEXT: v_readlane_b32 s38, v74, 4
+; GFX11-NEXT: v_readlane_b32 s37, v74, 3
+; GFX11-NEXT: v_readlane_b32 s36, v74, 2
+; GFX11-NEXT: v_readlane_b32 s35, v74, 1
+; GFX11-NEXT: v_readlane_b32 s34, v74, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72
@@ -186567,7 +186567,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:300
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v41, s30, 0
+; SI-NEXT: v_writelane_b32 v41, s34, 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s29, 0
; SI-NEXT: v_writelane_b32 v43, s28, 1
@@ -186582,41 +186582,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v43, s19, 10
; SI-NEXT: v_writelane_b32 v43, s18, 11
; SI-NEXT: v_writelane_b32 v43, s17, 12
-; SI-NEXT: v_writelane_b32 v41, s31, 1
-; SI-NEXT: v_writelane_b32 v41, s34, 2
-; SI-NEXT: v_writelane_b32 v41, s35, 3
-; SI-NEXT: v_writelane_b32 v41, s36, 4
-; SI-NEXT: v_writelane_b32 v41, s37, 5
-; SI-NEXT: v_writelane_b32 v41, s38, 6
-; SI-NEXT: v_writelane_b32 v41, s39, 7
-; SI-NEXT: v_writelane_b32 v41, s48, 8
-; SI-NEXT: v_writelane_b32 v41, s49, 9
-; SI-NEXT: v_writelane_b32 v41, s50, 10
-; SI-NEXT: v_writelane_b32 v41, s51, 11
-; SI-NEXT: v_writelane_b32 v41, s52, 12
-; SI-NEXT: v_writelane_b32 v41, s53, 13
-; SI-NEXT: v_writelane_b32 v41, s54, 14
-; SI-NEXT: v_writelane_b32 v41, s55, 15
-; SI-NEXT: v_writelane_b32 v41, s64, 16
-; SI-NEXT: v_writelane_b32 v41, s65, 17
-; SI-NEXT: v_writelane_b32 v41, s66, 18
-; SI-NEXT: v_writelane_b32 v41, s67, 19
-; SI-NEXT: v_writelane_b32 v41, s68, 20
-; SI-NEXT: v_writelane_b32 v41, s69, 21
-; SI-NEXT: v_writelane_b32 v41, s70, 22
-; SI-NEXT: v_writelane_b32 v41, s71, 23
-; SI-NEXT: v_writelane_b32 v41, s80, 24
-; SI-NEXT: v_writelane_b32 v41, s81, 25
-; SI-NEXT: v_writelane_b32 v41, s82, 26
-; SI-NEXT: v_writelane_b32 v41, s83, 27
-; SI-NEXT: v_writelane_b32 v41, s84, 28
-; SI-NEXT: v_writelane_b32 v41, s85, 29
-; SI-NEXT: v_writelane_b32 v41, s86, 30
-; SI-NEXT: v_writelane_b32 v41, s87, 31
-; SI-NEXT: v_writelane_b32 v41, s96, 32
-; SI-NEXT: v_writelane_b32 v41, s97, 33
-; SI-NEXT: v_writelane_b32 v41, s98, 34
+; SI-NEXT: v_writelane_b32 v41, s35, 1
+; SI-NEXT: v_writelane_b32 v41, s36, 2
+; SI-NEXT: v_writelane_b32 v41, s37, 3
+; SI-NEXT: v_writelane_b32 v41, s38, 4
+; SI-NEXT: v_writelane_b32 v41, s39, 5
+; SI-NEXT: v_writelane_b32 v41, s48, 6
+; SI-NEXT: v_writelane_b32 v41, s49, 7
+; SI-NEXT: v_writelane_b32 v41, s50, 8
+; SI-NEXT: v_writelane_b32 v41, s51, 9
+; SI-NEXT: v_writelane_b32 v41, s52, 10
+; SI-NEXT: v_writelane_b32 v41, s53, 11
+; SI-NEXT: v_writelane_b32 v41, s54, 12
+; SI-NEXT: v_writelane_b32 v41, s55, 13
+; SI-NEXT: v_writelane_b32 v41, s64, 14
+; SI-NEXT: v_writelane_b32 v41, s65, 15
+; SI-NEXT: v_writelane_b32 v41, s66, 16
+; SI-NEXT: v_writelane_b32 v41, s67, 17
+; SI-NEXT: v_writelane_b32 v41, s68, 18
+; SI-NEXT: v_writelane_b32 v41, s69, 19
+; SI-NEXT: v_writelane_b32 v41, s70, 20
+; SI-NEXT: v_writelane_b32 v41, s71, 21
+; SI-NEXT: v_writelane_b32 v41, s80, 22
+; SI-NEXT: v_writelane_b32 v41, s81, 23
+; SI-NEXT: v_writelane_b32 v41, s82, 24
+; SI-NEXT: v_writelane_b32 v41, s83, 25
+; SI-NEXT: v_writelane_b32 v41, s84, 26
+; SI-NEXT: v_writelane_b32 v41, s85, 27
+; SI-NEXT: v_writelane_b32 v41, s86, 28
+; SI-NEXT: v_writelane_b32 v41, s87, 29
+; SI-NEXT: v_writelane_b32 v41, s96, 30
+; SI-NEXT: v_writelane_b32 v41, s97, 31
+; SI-NEXT: v_writelane_b32 v41, s98, 32
; SI-NEXT: s_mov_b32 s22, s16
+; SI-NEXT: v_writelane_b32 v41, s99, 33
+; SI-NEXT: v_writelane_b32 v41, s30, 34
+; SI-NEXT: v_writelane_b32 v41, s31, 35
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
@@ -186643,7 +186644,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s60, v27
; SI-NEXT: v_readfirstlane_b32 s61, v26
; SI-NEXT: v_readfirstlane_b32 s12, v0
-; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296
@@ -186673,6 +186673,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:256
; SI-NEXT: v_readfirstlane_b32 s6, v37
; SI-NEXT: v_readfirstlane_b32 s7, v38
+; SI-NEXT: v_readfirstlane_b32 s13, v1
; SI-NEXT: v_readfirstlane_b32 s14, v8
; SI-NEXT: v_readfirstlane_b32 s15, v9
; SI-NEXT: v_readfirstlane_b32 s40, v7
@@ -186686,7 +186687,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s78, v15
; SI-NEXT: v_readfirstlane_b32 s38, v13
; SI-NEXT: v_readfirstlane_b32 s39, v24
-; SI-NEXT: v_writelane_b32 v41, s99, 35
; SI-NEXT: v_readfirstlane_b32 s48, v25
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s99, v54
@@ -188058,6 +188058,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_lshl_b32 s17, s70, 16
; SI-NEXT: s_or_b32 s4, s4, s47
; SI-NEXT: s_or_b32 s5, s5, s17
+; SI-NEXT: v_readlane_b32 s30, v41, 34
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s18
; SI-NEXT: v_mov_b32_e32 v2, s19
@@ -188090,42 +188091,41 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v29, s7
; SI-NEXT: v_mov_b32_e32 v30, s4
; SI-NEXT: v_mov_b32_e32 v31, s5
-; SI-NEXT: v_readlane_b32 s99, v41, 35
-; SI-NEXT: v_readlane_b32 s98, v41, 34
-; SI-NEXT: v_readlane_b32 s97, v41, 33
-; SI-NEXT: v_readlane_b32 s96, v41, 32
-; SI-NEXT: v_readlane_b32 s87, v41, 31
-; SI-NEXT: v_readlane_b32 s86, v41, 30
-; SI-NEXT: v_readlane_b32 s85, v41, 29
-; SI-NEXT: v_readlane_b32 s84, v41, 28
-; SI-NEXT: v_readlane_b32 s83, v41, 27
-; SI-NEXT: v_readlane_b32 s82, v41, 26
-; SI-NEXT: v_readlane_b32 s81, v41, 25
-; SI-NEXT: v_readlane_b32 s80, v41, 24
-; SI-NEXT: v_readlane_b32 s71, v41, 23
-; SI-NEXT: v_readlane_b32 s70, v41, 22
-; SI-NEXT: v_readlane_b32 s69, v41, 21
-; SI-NEXT: v_readlane_b32 s68, v41, 20
-; SI-NEXT: v_readlane_b32 s67, v41, 19
-; SI-NEXT: v_readlane_b32 s66, v41, 18
-; SI-NEXT: v_readlane_b32 s65, v41, 17
-; SI-NEXT: v_readlane_b32 s64, v41, 16
-; SI-NEXT: v_readlane_b32 s55, v41, 15
-; SI-NEXT: v_readlane_b32 s54, v41, 14
-; SI-NEXT: v_readlane_b32 s53, v41, 13
-; SI-NEXT: v_readlane_b32 s52, v41, 12
-; SI-NEXT: v_readlane_b32 s51, v41, 11
-; SI-NEXT: v_readlane_b32 s50, v41, 10
-; SI-NEXT: v_readlane_b32 s49, v41, 9
-; SI-NEXT: v_readlane_b32 s48, v41, 8
-; SI-NEXT: v_readlane_b32 s39, v41, 7
-; SI-NEXT: v_readlane_b32 s38, v41, 6
-; SI-NEXT: v_readlane_b32 s37, v41, 5
-; SI-NEXT: v_readlane_b32 s36, v41, 4
-; SI-NEXT: v_readlane_b32 s35, v41, 3
-; SI-NEXT: v_readlane_b32 s34, v41, 2
-; SI-NEXT: v_readlane_b32 s31, v41, 1
-; SI-NEXT: v_readlane_b32 s30, v41, 0
+; SI-NEXT: v_readlane_b32 s31, v41, 35
+; SI-NEXT: v_readlane_b32 s99, v41, 33
+; SI-NEXT: v_readlane_b32 s98, v41, 32
+; SI-NEXT: v_readlane_b32 s97, v41, 31
+; SI-NEXT: v_readlane_b32 s96, v41, 30
+; SI-NEXT: v_readlane_b32 s87, v41, 29
+; SI-NEXT: v_readlane_b32 s86, v41, 28
+; SI-NEXT: v_readlane_b32 s85, v41, 27
+; SI-NEXT: v_readlane_b32 s84, v41, 26
+; SI-NEXT: v_readlane_b32 s83, v41, 25
+; SI-NEXT: v_readlane_b32 s82, v41, 24
+; SI-NEXT: v_readlane_b32 s81, v41, 23
+; SI-NEXT: v_readlane_b32 s80, v41, 22
+; SI-NEXT: v_readlane_b32 s71, v41, 21
+; SI-NEXT: v_readlane_b32 s70, v41, 20
+; SI-NEXT: v_readlane_b32 s69, v41, 19
+; SI-NEXT: v_readlane_b32 s68, v41, 18
+; SI-NEXT: v_readlane_b32 s67, v41, 17
+; SI-NEXT: v_readlane_b32 s66, v41, 16
+; SI-NEXT: v_readlane_b32 s65, v41, 15
+; SI-NEXT: v_readlane_b32 s64, v41, 14
+; SI-NEXT: v_readlane_b32 s55, v41, 13
+; SI-NEXT: v_readlane_b32 s54, v41, 12
+; SI-NEXT: v_readlane_b32 s53, v41, 11
+; SI-NEXT: v_readlane_b32 s52, v41, 10
+; SI-NEXT: v_readlane_b32 s51, v41, 9
+; SI-NEXT: v_readlane_b32 s50, v41, 8
+; SI-NEXT: v_readlane_b32 s49, v41, 7
+; SI-NEXT: v_readlane_b32 s48, v41, 6
+; SI-NEXT: v_readlane_b32 s39, v41, 5
+; SI-NEXT: v_readlane_b32 s38, v41, 4
+; SI-NEXT: v_readlane_b32 s37, v41, 3
+; SI-NEXT: v_readlane_b32 s36, v41, 2
+; SI-NEXT: v_readlane_b32 s35, v41, 1
+; SI-NEXT: v_readlane_b32 s34, v41, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
@@ -196362,78 +196362,79 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v21, s30, 0
-; SI-NEXT: v_writelane_b32 v21, s31, 1
-; SI-NEXT: v_writelane_b32 v21, s34, 2
-; SI-NEXT: v_writelane_b32 v21, s35, 3
-; SI-NEXT: v_writelane_b32 v21, s36, 4
-; SI-NEXT: v_writelane_b32 v21, s37, 5
-; SI-NEXT: v_writelane_b32 v21, s38, 6
-; SI-NEXT: v_writelane_b32 v21, s39, 7
-; SI-NEXT: v_writelane_b32 v21, s48, 8
-; SI-NEXT: v_writelane_b32 v21, s49, 9
-; SI-NEXT: v_writelane_b32 v21, s50, 10
-; SI-NEXT: v_writelane_b32 v21, s51, 11
-; SI-NEXT: v_writelane_b32 v21, s52, 12
-; SI-NEXT: v_writelane_b32 v21, s53, 13
-; SI-NEXT: v_writelane_b32 v21, s54, 14
-; SI-NEXT: v_writelane_b32 v21, s55, 15
-; SI-NEXT: v_writelane_b32 v21, s64, 16
-; SI-NEXT: v_writelane_b32 v21, s65, 17
-; SI-NEXT: v_writelane_b32 v21, s66, 18
-; SI-NEXT: v_writelane_b32 v21, s67, 19
-; SI-NEXT: v_writelane_b32 v21, s68, 20
+; SI-NEXT: v_writelane_b32 v21, s34, 0
+; SI-NEXT: v_writelane_b32 v21, s35, 1
+; SI-NEXT: v_writelane_b32 v21, s36, 2
+; SI-NEXT: v_writelane_b32 v21, s37, 3
+; SI-NEXT: v_writelane_b32 v21, s38, 4
+; SI-NEXT: v_writelane_b32 v21, s39, 5
+; SI-NEXT: v_writelane_b32 v21, s48, 6
+; SI-NEXT: v_writelane_b32 v21, s49, 7
+; SI-NEXT: v_writelane_b32 v21, s50, 8
+; SI-NEXT: v_writelane_b32 v21, s51, 9
+; SI-NEXT: v_writelane_b32 v21, s52, 10
+; SI-NEXT: v_writelane_b32 v21, s53, 11
+; SI-NEXT: v_writelane_b32 v21, s54, 12
+; SI-NEXT: v_writelane_b32 v21, s55, 13
+; SI-NEXT: v_writelane_b32 v21, s64, 14
+; SI-NEXT: v_writelane_b32 v21, s65, 15
+; SI-NEXT: v_writelane_b32 v21, s66, 16
+; SI-NEXT: v_writelane_b32 v21, s67, 17
+; SI-NEXT: v_writelane_b32 v21, s68, 18
+; SI-NEXT: v_writelane_b32 v21, s69, 19
+; SI-NEXT: v_writelane_b32 v21, s70, 20
+; SI-NEXT: v_writelane_b32 v21, s71, 21
; SI-NEXT: v_readfirstlane_b32 s6, v12
; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v21, s69, 21
+; SI-NEXT: v_writelane_b32 v21, s80, 22
; SI-NEXT: v_readfirstlane_b32 s7, v1
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_writelane_b32 v22, s6, 0
-; SI-NEXT: v_writelane_b32 v21, s70, 22
+; SI-NEXT: v_writelane_b32 v21, s81, 23
; SI-NEXT: v_readfirstlane_b32 s8, v4
; SI-NEXT: v_writelane_b32 v22, s7, 1
-; SI-NEXT: v_writelane_b32 v21, s71, 23
+; SI-NEXT: v_writelane_b32 v21, s82, 24
; SI-NEXT: v_readfirstlane_b32 s9, v11
; SI-NEXT: v_writelane_b32 v22, s8, 2
-; SI-NEXT: v_writelane_b32 v21, s80, 24
+; SI-NEXT: v_writelane_b32 v21, s83, 25
; SI-NEXT: v_readfirstlane_b32 s48, v14
; SI-NEXT: v_writelane_b32 v22, s9, 3
-; SI-NEXT: v_writelane_b32 v21, s81, 25
+; SI-NEXT: v_writelane_b32 v21, s84, 26
; SI-NEXT: v_readfirstlane_b32 s10, v3
; SI-NEXT: v_writelane_b32 v22, s48, 4
-; SI-NEXT: v_writelane_b32 v21, s82, 26
+; SI-NEXT: v_writelane_b32 v21, s85, 27
; SI-NEXT: v_readfirstlane_b32 s50, v13
; SI-NEXT: v_writelane_b32 v22, s10, 5
-; SI-NEXT: v_writelane_b32 v21, s83, 27
+; SI-NEXT: v_writelane_b32 v21, s86, 28
; SI-NEXT: v_readfirstlane_b32 s52, v16
; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1
; SI-NEXT: v_writelane_b32 v22, s50, 6
-; SI-NEXT: v_writelane_b32 v21, s84, 28
+; SI-NEXT: v_writelane_b32 v21, s87, 29
; SI-NEXT: v_readfirstlane_b32 s11, v20
; SI-NEXT: v_writelane_b32 v22, s52, 7
-; SI-NEXT: v_writelane_b32 v21, s85, 29
+; SI-NEXT: v_writelane_b32 v21, s96, 30
; SI-NEXT: v_readfirstlane_b32 s54, v15
; SI-NEXT: v_writelane_b32 v22, s11, 8
-; SI-NEXT: v_writelane_b32 v21, s86, 30
+; SI-NEXT: v_writelane_b32 v21, s97, 31
; SI-NEXT: v_readfirstlane_b32 s64, v18
; SI-NEXT: v_writelane_b32 v22, s54, 9
-; SI-NEXT: v_writelane_b32 v21, s87, 31
+; SI-NEXT: v_writelane_b32 v21, s98, 32
; SI-NEXT: v_readfirstlane_b32 s66, v17
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT: v_writelane_b32 v22, s64, 10
-; SI-NEXT: v_writelane_b32 v21, s96, 32
+; SI-NEXT: v_writelane_b32 v21, s99, 33
; SI-NEXT: v_readfirstlane_b32 s81, v5
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_readfirstlane_b32 s12, v4
; SI-NEXT: v_writelane_b32 v22, s66, 11
-; SI-NEXT: v_writelane_b32 v21, s97, 33
+; SI-NEXT: v_writelane_b32 v21, s30, 34
; SI-NEXT: v_readfirstlane_b32 s85, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_readfirstlane_b32 s13, v5
; SI-NEXT: v_writelane_b32 v22, s12, 12
-; SI-NEXT: v_writelane_b32 v21, s98, 34
+; SI-NEXT: v_writelane_b32 v21, s31, 35
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
@@ -196467,7 +196468,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: v_readfirstlane_b32 s14, v7
; SI-NEXT: v_writelane_b32 v22, s13, 13
-; SI-NEXT: v_writelane_b32 v21, s99, 35
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s99, v2
; SI-NEXT: v_readfirstlane_b32 s76, v18
@@ -197660,6 +197660,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v21, 34
; SI-NEXT: v_readlane_b32 s19, v22, 55
; SI-NEXT: v_readlane_b32 s17, v22, 61
; SI-NEXT: v_readlane_b32 s15, v23, 3
@@ -197667,42 +197668,41 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_readlane_b32 s11, v23, 15
; SI-NEXT: v_readlane_b32 s9, v23, 19
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v21, 35
-; SI-NEXT: v_readlane_b32 s98, v21, 34
-; SI-NEXT: v_readlane_b32 s97, v21, 33
-; SI-NEXT: v_readlane_b32 s96, v21, 32
-; SI-NEXT: v_readlane_b32 s87, v21, 31
-; SI-NEXT: v_readlane_b32 s86, v21, 30
-; SI-NEXT: v_readlane_b32 s85, v21, 29
-; SI-NEXT: v_readlane_b32 s84, v21, 28
-; SI-NEXT: v_readlane_b32 s83, v21, 27
-; SI-NEXT: v_readlane_b32 s82, v21, 26
-; SI-NEXT: v_readlane_b32 s81, v21, 25
-; SI-NEXT: v_readlane_b32 s80, v21, 24
-; SI-NEXT: v_readlane_b32 s71, v21, 23
-; SI-NEXT: v_readlane_b32 s70, v21, 22
-; SI-NEXT: v_readlane_b32 s69, v21, 21
-; SI-NEXT: v_readlane_b32 s68, v21, 20
-; SI-NEXT: v_readlane_b32 s67, v21, 19
-; SI-NEXT: v_readlane_b32 s66, v21, 18
-; SI-NEXT: v_readlane_b32 s65, v21, 17
-; SI-NEXT: v_readlane_b32 s64, v21, 16
-; SI-NEXT: v_readlane_b32 s55, v21, 15
-; SI-NEXT: v_readlane_b32 s54, v21, 14
-; SI-NEXT: v_readlane_b32 s53, v21, 13
-; SI-NEXT: v_readlane_b32 s52, v21, 12
-; SI-NEXT: v_readlane_b32 s51, v21, 11
-; SI-NEXT: v_readlane_b32 s50, v21, 10
-; SI-NEXT: v_readlane_b32 s49, v21, 9
-; SI-NEXT: v_readlane_b32 s48, v21, 8
-; SI-NEXT: v_readlane_b32 s39, v21, 7
-; SI-NEXT: v_readlane_b32 s38, v21, 6
-; SI-NEXT: v_readlane_b32 s37, v21, 5
-; SI-NEXT: v_readlane_b32 s36, v21, 4
-; SI-NEXT: v_readlane_b32 s35, v21, 3
-; SI-NEXT: v_readlane_b32 s34, v21, 2
-; SI-NEXT: v_readlane_b32 s31, v21, 1
-; SI-NEXT: v_readlane_b32 s30, v21, 0
+; SI-NEXT: v_readlane_b32 s31, v21, 35
+; SI-NEXT: v_readlane_b32 s99, v21, 33
+; SI-NEXT: v_readlane_b32 s98, v21, 32
+; SI-NEXT: v_readlane_b32 s97, v21, 31
+; SI-NEXT: v_readlane_b32 s96, v21, 30
+; SI-NEXT: v_readlane_b32 s87, v21, 29
+; SI-NEXT: v_readlane_b32 s86, v21, 28
+; SI-NEXT: v_readlane_b32 s85, v21, 27
+; SI-NEXT: v_readlane_b32 s84, v21, 26
+; SI-NEXT: v_readlane_b32 s83, v21, 25
+; SI-NEXT: v_readlane_b32 s82, v21, 24
+; SI-NEXT: v_readlane_b32 s81, v21, 23
+; SI-NEXT: v_readlane_b32 s80, v21, 22
+; SI-NEXT: v_readlane_b32 s71, v21, 21
+; SI-NEXT: v_readlane_b32 s70, v21, 20
+; SI-NEXT: v_readlane_b32 s69, v21, 19
+; SI-NEXT: v_readlane_b32 s68, v21, 18
+; SI-NEXT: v_readlane_b32 s67, v21, 17
+; SI-NEXT: v_readlane_b32 s66, v21, 16
+; SI-NEXT: v_readlane_b32 s65, v21, 15
+; SI-NEXT: v_readlane_b32 s64, v21, 14
+; SI-NEXT: v_readlane_b32 s55, v21, 13
+; SI-NEXT: v_readlane_b32 s54, v21, 12
+; SI-NEXT: v_readlane_b32 s53, v21, 11
+; SI-NEXT: v_readlane_b32 s52, v21, 10
+; SI-NEXT: v_readlane_b32 s51, v21, 9
+; SI-NEXT: v_readlane_b32 s50, v21, 8
+; SI-NEXT: v_readlane_b32 s49, v21, 7
+; SI-NEXT: v_readlane_b32 s48, v21, 6
+; SI-NEXT: v_readlane_b32 s39, v21, 5
+; SI-NEXT: v_readlane_b32 s38, v21, 4
+; SI-NEXT: v_readlane_b32 s37, v21, 3
+; SI-NEXT: v_readlane_b32 s36, v21, 2
+; SI-NEXT: v_readlane_b32 s35, v21, 1
+; SI-NEXT: v_readlane_b32 s34, v21, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -197917,36 +197917,36 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v32, s30, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 1
-; VI-NEXT: v_writelane_b32 v32, s34, 2
-; VI-NEXT: v_writelane_b32 v32, s35, 3
-; VI-NEXT: v_writelane_b32 v32, s36, 4
-; VI-NEXT: v_writelane_b32 v32, s37, 5
-; VI-NEXT: v_writelane_b32 v32, s38, 6
-; VI-NEXT: v_writelane_b32 v32, s39, 7
-; VI-NEXT: v_writelane_b32 v32, s48, 8
-; VI-NEXT: v_writelane_b32 v32, s49, 9
-; VI-NEXT: v_writelane_b32 v32, s50, 10
-; VI-NEXT: v_writelane_b32 v32, s51, 11
-; VI-NEXT: v_writelane_b32 v32, s52, 12
-; VI-NEXT: v_writelane_b32 v32, s53, 13
-; VI-NEXT: v_writelane_b32 v32, s54, 14
-; VI-NEXT: v_writelane_b32 v32, s55, 15
-; VI-NEXT: v_writelane_b32 v32, s64, 16
-; VI-NEXT: v_writelane_b32 v32, s65, 17
-; VI-NEXT: v_writelane_b32 v32, s66, 18
-; VI-NEXT: v_writelane_b32 v32, s67, 19
-; VI-NEXT: v_writelane_b32 v32, s68, 20
-; VI-NEXT: v_writelane_b32 v32, s69, 21
-; VI-NEXT: v_writelane_b32 v32, s70, 22
-; VI-NEXT: v_writelane_b32 v32, s71, 23
-; VI-NEXT: v_writelane_b32 v32, s80, 24
-; VI-NEXT: v_writelane_b32 v32, s81, 25
-; VI-NEXT: v_writelane_b32 v32, s82, 26
-; VI-NEXT: v_writelane_b32 v32, s83, 27
-; VI-NEXT: v_writelane_b32 v32, s84, 28
-; VI-NEXT: v_writelane_b32 v32, s85, 29
+; VI-NEXT: v_writelane_b32 v32, s34, 0
+; VI-NEXT: v_writelane_b32 v32, s35, 1
+; VI-NEXT: v_writelane_b32 v32, s36, 2
+; VI-NEXT: v_writelane_b32 v32, s37, 3
+; VI-NEXT: v_writelane_b32 v32, s38, 4
+; VI-NEXT: v_writelane_b32 v32, s39, 5
+; VI-NEXT: v_writelane_b32 v32, s48, 6
+; VI-NEXT: v_writelane_b32 v32, s49, 7
+; VI-NEXT: v_writelane_b32 v32, s50, 8
+; VI-NEXT: v_writelane_b32 v32, s51, 9
+; VI-NEXT: v_writelane_b32 v32, s52, 10
+; VI-NEXT: v_writelane_b32 v32, s53, 11
+; VI-NEXT: v_writelane_b32 v32, s54, 12
+; VI-NEXT: v_writelane_b32 v32, s55, 13
+; VI-NEXT: v_writelane_b32 v32, s64, 14
+; VI-NEXT: v_writelane_b32 v32, s65, 15
+; VI-NEXT: v_writelane_b32 v32, s66, 16
+; VI-NEXT: v_writelane_b32 v32, s67, 17
+; VI-NEXT: v_writelane_b32 v32, s68, 18
+; VI-NEXT: v_writelane_b32 v32, s69, 19
+; VI-NEXT: v_writelane_b32 v32, s70, 20
+; VI-NEXT: v_writelane_b32 v32, s71, 21
+; VI-NEXT: v_writelane_b32 v32, s80, 22
+; VI-NEXT: v_writelane_b32 v32, s81, 23
+; VI-NEXT: v_writelane_b32 v32, s82, 24
+; VI-NEXT: v_writelane_b32 v32, s83, 25
+; VI-NEXT: v_writelane_b32 v32, s84, 26
+; VI-NEXT: v_writelane_b32 v32, s85, 27
+; VI-NEXT: v_writelane_b32 v32, s86, 28
+; VI-NEXT: v_writelane_b32 v32, s87, 29
; VI-NEXT: v_readfirstlane_b32 s40, v3
; VI-NEXT: v_mov_b32_e32 v3, s16
; VI-NEXT: v_readfirstlane_b32 s41, v4
@@ -197976,7 +197976,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s7, v16
; VI-NEXT: v_mov_b32_e32 v16, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT: v_writelane_b32 v32, s86, 30
+; VI-NEXT: v_writelane_b32 v32, s30, 30
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: v_readfirstlane_b32 s5, v18
; VI-NEXT: v_readfirstlane_b32 s44, v3
@@ -197996,7 +197996,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s18, v1
; VI-NEXT: s_and_b64 s[46:47], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s19, v2
-; VI-NEXT: v_writelane_b32 v32, s87, 31
+; VI-NEXT: v_writelane_b32 v32, s31, 31
; VI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB99_4
; VI-NEXT: ; %bb.1: ; %cmp.false
@@ -198784,39 +198784,39 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: v_readlane_b32 s30, v32, 30
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s87, v32, 31
-; VI-NEXT: v_readlane_b32 s86, v32, 30
-; VI-NEXT: v_readlane_b32 s85, v32, 29
-; VI-NEXT: v_readlane_b32 s84, v32, 28
-; VI-NEXT: v_readlane_b32 s83, v32, 27
-; VI-NEXT: v_readlane_b32 s82, v32, 26
-; VI-NEXT: v_readlane_b32 s81, v32, 25
-; VI-NEXT: v_readlane_b32 s80, v32, 24
-; VI-NEXT: v_readlane_b32 s71, v32, 23
-; VI-NEXT: v_readlane_b32 s70, v32, 22
-; VI-NEXT: v_readlane_b32 s69, v32, 21
-; VI-NEXT: v_readlane_b32 s68, v32, 20
-; VI-NEXT: v_readlane_b32 s67, v32, 19
-; VI-NEXT: v_readlane_b32 s66, v32, 18
-; VI-NEXT: v_readlane_b32 s65, v32, 17
-; VI-NEXT: v_readlane_b32 s64, v32, 16
-; VI-NEXT: v_readlane_b32 s55, v32, 15
-; VI-NEXT: v_readlane_b32 s54, v32, 14
-; VI-NEXT: v_readlane_b32 s53, v32, 13
-; VI-NEXT: v_readlane_b32 s52, v32, 12
-; VI-NEXT: v_readlane_b32 s51, v32, 11
-; VI-NEXT: v_readlane_b32 s50, v32, 10
-; VI-NEXT: v_readlane_b32 s49, v32, 9
-; VI-NEXT: v_readlane_b32 s48, v32, 8
-; VI-NEXT: v_readlane_b32 s39, v32, 7
-; VI-NEXT: v_readlane_b32 s38, v32, 6
-; VI-NEXT: v_readlane_b32 s37, v32, 5
-; VI-NEXT: v_readlane_b32 s36, v32, 4
-; VI-NEXT: v_readlane_b32 s35, v32, 3
-; VI-NEXT: v_readlane_b32 s34, v32, 2
-; VI-NEXT: v_readlane_b32 s31, v32, 1
-; VI-NEXT: v_readlane_b32 s30, v32, 0
+; VI-NEXT: v_readlane_b32 s31, v32, 31
+; VI-NEXT: v_readlane_b32 s87, v32, 29
+; VI-NEXT: v_readlane_b32 s86, v32, 28
+; VI-NEXT: v_readlane_b32 s85, v32, 27
+; VI-NEXT: v_readlane_b32 s84, v32, 26
+; VI-NEXT: v_readlane_b32 s83, v32, 25
+; VI-NEXT: v_readlane_b32 s82, v32, 24
+; VI-NEXT: v_readlane_b32 s81, v32, 23
+; VI-NEXT: v_readlane_b32 s80, v32, 22
+; VI-NEXT: v_readlane_b32 s71, v32, 21
+; VI-NEXT: v_readlane_b32 s70, v32, 20
+; VI-NEXT: v_readlane_b32 s69, v32, 19
+; VI-NEXT: v_readlane_b32 s68, v32, 18
+; VI-NEXT: v_readlane_b32 s67, v32, 17
+; VI-NEXT: v_readlane_b32 s66, v32, 16
+; VI-NEXT: v_readlane_b32 s65, v32, 15
+; VI-NEXT: v_readlane_b32 s64, v32, 14
+; VI-NEXT: v_readlane_b32 s55, v32, 13
+; VI-NEXT: v_readlane_b32 s54, v32, 12
+; VI-NEXT: v_readlane_b32 s53, v32, 11
+; VI-NEXT: v_readlane_b32 s52, v32, 10
+; VI-NEXT: v_readlane_b32 s51, v32, 9
+; VI-NEXT: v_readlane_b32 s50, v32, 8
+; VI-NEXT: v_readlane_b32 s49, v32, 7
+; VI-NEXT: v_readlane_b32 s48, v32, 6
+; VI-NEXT: v_readlane_b32 s39, v32, 5
+; VI-NEXT: v_readlane_b32 s38, v32, 4
+; VI-NEXT: v_readlane_b32 s37, v32, 3
+; VI-NEXT: v_readlane_b32 s36, v32, 2
+; VI-NEXT: v_readlane_b32 s35, v32, 1
+; VI-NEXT: v_readlane_b32 s34, v32, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -199896,43 +199896,43 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v74, s30, 0
-; GFX11-NEXT: v_writelane_b32 v75, s96, 0
+; GFX11-NEXT: v_writelane_b32 v74, s34, 0
+; GFX11-NEXT: v_writelane_b32 v75, s98, 0
; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT: v_writelane_b32 v74, s31, 1
-; GFX11-NEXT: v_writelane_b32 v75, s97, 1
+; GFX11-NEXT: v_writelane_b32 v74, s35, 1
+; GFX11-NEXT: v_writelane_b32 v75, s99, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s40, v16
; GFX11-NEXT: v_readfirstlane_b32 s41, v17
; GFX11-NEXT: v_readfirstlane_b32 s28, v1
-; GFX11-NEXT: v_writelane_b32 v74, s34, 2
-; GFX11-NEXT: v_writelane_b32 v75, s98, 2
+; GFX11-NEXT: v_writelane_b32 v74, s36, 2
+; GFX11-NEXT: v_writelane_b32 v75, s100, 2
; GFX11-NEXT: v_readfirstlane_b32 s29, v2
; GFX11-NEXT: v_readfirstlane_b32 s14, v3
; GFX11-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-NEXT: v_writelane_b32 v74, s35, 3
-; GFX11-NEXT: v_writelane_b32 v75, s99, 3
+; GFX11-NEXT: v_writelane_b32 v74, s37, 3
+; GFX11-NEXT: v_writelane_b32 v75, s101, 3
; GFX11-NEXT: v_readfirstlane_b32 s12, v5
; GFX11-NEXT: v_readfirstlane_b32 s13, v6
; GFX11-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-NEXT: v_writelane_b32 v74, s36, 4
-; GFX11-NEXT: v_writelane_b32 v75, s100, 4
+; GFX11-NEXT: v_writelane_b32 v74, s38, 4
+; GFX11-NEXT: v_writelane_b32 v75, s102, 4
; GFX11-NEXT: v_readfirstlane_b32 s11, v8
; GFX11-NEXT: v_readfirstlane_b32 s8, v9
; GFX11-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-NEXT: v_writelane_b32 v74, s37, 5
-; GFX11-NEXT: v_writelane_b32 v75, s101, 5
+; GFX11-NEXT: v_writelane_b32 v74, s39, 5
+; GFX11-NEXT: v_writelane_b32 v75, s103, 5
; GFX11-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-NEXT: v_readfirstlane_b32 s7, v12
; GFX11-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-NEXT: v_writelane_b32 v74, s38, 6
-; GFX11-NEXT: v_writelane_b32 v75, s102, 6
+; GFX11-NEXT: v_writelane_b32 v74, s48, 6
+; GFX11-NEXT: v_writelane_b32 v75, s104, 6
; GFX11-NEXT: v_readfirstlane_b32 s5, v14
; GFX11-NEXT: s_mov_b32 s99, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: v_writelane_b32 v74, s39, 7
-; GFX11-NEXT: v_writelane_b32 v75, s103, 7
+; GFX11-NEXT: v_writelane_b32 v74, s49, 7
+; GFX11-NEXT: v_writelane_b32 v75, s30, 7
; GFX11-NEXT: s_clause 0x11 ; 72-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:64
@@ -199954,31 +199954,31 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v73, s32
; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane
; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane
-; GFX11-NEXT: v_writelane_b32 v74, s48, 8
-; GFX11-NEXT: v_writelane_b32 v75, s104, 8
-; GFX11-NEXT: v_writelane_b32 v74, s49, 9
-; GFX11-NEXT: v_writelane_b32 v74, s50, 10
-; GFX11-NEXT: v_writelane_b32 v74, s51, 11
-; GFX11-NEXT: v_writelane_b32 v74, s52, 12
-; GFX11-NEXT: v_writelane_b32 v74, s53, 13
-; GFX11-NEXT: v_writelane_b32 v74, s54, 14
-; GFX11-NEXT: v_writelane_b32 v74, s55, 15
-; GFX11-NEXT: v_writelane_b32 v74, s64, 16
-; GFX11-NEXT: v_writelane_b32 v74, s65, 17
-; GFX11-NEXT: v_writelane_b32 v74, s66, 18
-; GFX11-NEXT: v_writelane_b32 v74, s67, 19
-; GFX11-NEXT: v_writelane_b32 v74, s68, 20
-; GFX11-NEXT: v_writelane_b32 v74, s69, 21
-; GFX11-NEXT: v_writelane_b32 v74, s70, 22
-; GFX11-NEXT: v_writelane_b32 v74, s71, 23
-; GFX11-NEXT: v_writelane_b32 v74, s80, 24
-; GFX11-NEXT: v_writelane_b32 v74, s81, 25
-; GFX11-NEXT: v_writelane_b32 v74, s82, 26
-; GFX11-NEXT: v_writelane_b32 v74, s83, 27
-; GFX11-NEXT: v_writelane_b32 v74, s84, 28
-; GFX11-NEXT: v_writelane_b32 v74, s85, 29
-; GFX11-NEXT: v_writelane_b32 v74, s86, 30
-; GFX11-NEXT: v_writelane_b32 v74, s87, 31
+; GFX11-NEXT: v_writelane_b32 v74, s50, 8
+; GFX11-NEXT: v_writelane_b32 v75, s31, 8
+; GFX11-NEXT: v_writelane_b32 v74, s51, 9
+; GFX11-NEXT: v_writelane_b32 v74, s52, 10
+; GFX11-NEXT: v_writelane_b32 v74, s53, 11
+; GFX11-NEXT: v_writelane_b32 v74, s54, 12
+; GFX11-NEXT: v_writelane_b32 v74, s55, 13
+; GFX11-NEXT: v_writelane_b32 v74, s64, 14
+; GFX11-NEXT: v_writelane_b32 v74, s65, 15
+; GFX11-NEXT: v_writelane_b32 v74, s66, 16
+; GFX11-NEXT: v_writelane_b32 v74, s67, 17
+; GFX11-NEXT: v_writelane_b32 v74, s68, 18
+; GFX11-NEXT: v_writelane_b32 v74, s69, 19
+; GFX11-NEXT: v_writelane_b32 v74, s70, 20
+; GFX11-NEXT: v_writelane_b32 v74, s71, 21
+; GFX11-NEXT: v_writelane_b32 v74, s80, 22
+; GFX11-NEXT: v_writelane_b32 v74, s81, 23
+; GFX11-NEXT: v_writelane_b32 v74, s82, 24
+; GFX11-NEXT: v_writelane_b32 v74, s83, 25
+; GFX11-NEXT: v_writelane_b32 v74, s84, 26
+; GFX11-NEXT: v_writelane_b32 v74, s85, 27
+; GFX11-NEXT: v_writelane_b32 v74, s86, 28
+; GFX11-NEXT: v_writelane_b32 v74, s87, 29
+; GFX11-NEXT: v_writelane_b32 v74, s96, 30
+; GFX11-NEXT: v_writelane_b32 v74, s97, 31
; GFX11-NEXT: s_cbranch_scc0 .LBB99_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 16
@@ -200706,47 +200706,47 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:68
-; GFX11-NEXT: v_readlane_b32 s104, v75, 8
-; GFX11-NEXT: v_readlane_b32 s103, v75, 7
-; GFX11-NEXT: v_readlane_b32 s102, v75, 6
-; GFX11-NEXT: v_readlane_b32 s101, v75, 5
-; GFX11-NEXT: v_readlane_b32 s100, v75, 4
-; GFX11-NEXT: v_readlane_b32 s99, v75, 3
-; GFX11-NEXT: v_readlane_b32 s98, v75, 2
-; GFX11-NEXT: v_readlane_b32 s97, v75, 1
-; GFX11-NEXT: v_readlane_b32 s96, v75, 0
-; GFX11-NEXT: v_readlane_b32 s87, v74, 31
-; GFX11-NEXT: v_readlane_b32 s86, v74, 30
-; GFX11-NEXT: v_readlane_b32 s85, v74, 29
-; GFX11-NEXT: v_readlane_b32 s84, v74, 28
-; GFX11-NEXT: v_readlane_b32 s83, v74, 27
-; GFX11-NEXT: v_readlane_b32 s82, v74, 26
-; GFX11-NEXT: v_readlane_b32 s81, v74, 25
-; GFX11-NEXT: v_readlane_b32 s80, v74, 24
-; GFX11-NEXT: v_readlane_b32 s71, v74, 23
-; GFX11-NEXT: v_readlane_b32 s70, v74, 22
-; GFX11-NEXT: v_readlane_b32 s69, v74, 21
-; GFX11-NEXT: v_readlane_b32 s68, v74, 20
-; GFX11-NEXT: v_readlane_b32 s67, v74, 19
-; GFX11-NEXT: v_readlane_b32 s66, v74, 18
-; GFX11-NEXT: v_readlane_b32 s65, v74, 17
-; GFX11-NEXT: v_readlane_b32 s64, v74, 16
-; GFX11-NEXT: v_readlane_b32 s55, v74, 15
-; GFX11-NEXT: v_readlane_b32 s54, v74, 14
-; GFX11-NEXT: v_readlane_b32 s53, v74, 13
-; GFX11-NEXT: v_readlane_b32 s52, v74, 12
-; GFX11-NEXT: v_readlane_b32 s51, v74, 11
-; GFX11-NEXT: v_readlane_b32 s50, v74, 10
-; GFX11-NEXT: v_readlane_b32 s49, v74, 9
-; GFX11-NEXT: v_readlane_b32 s48, v74, 8
-; GFX11-NEXT: v_readlane_b32 s39, v74, 7
-; GFX11-NEXT: v_readlane_b32 s38, v74, 6
-; GFX11-NEXT: v_readlane_b32 s37, v74, 5
-; GFX11-NEXT: v_readlane_b32 s36, v74, 4
-; GFX11-NEXT: v_readlane_b32 s35, v74, 3
-; GFX11-NEXT: v_readlane_b32 s34, v74, 2
-; GFX11-NEXT: v_readlane_b32 s31, v74, 1
-; GFX11-NEXT: v_readlane_b32 s30, v74, 0
+; GFX11-NEXT: v_readlane_b32 s30, v75, 7
+; GFX11-NEXT: v_readlane_b32 s31, v75, 8
+; GFX11-NEXT: v_readlane_b32 s104, v75, 6
+; GFX11-NEXT: v_readlane_b32 s103, v75, 5
+; GFX11-NEXT: v_readlane_b32 s102, v75, 4
+; GFX11-NEXT: v_readlane_b32 s101, v75, 3
+; GFX11-NEXT: v_readlane_b32 s100, v75, 2
+; GFX11-NEXT: v_readlane_b32 s99, v75, 1
+; GFX11-NEXT: v_readlane_b32 s98, v75, 0
+; GFX11-NEXT: v_readlane_b32 s97, v74, 31
+; GFX11-NEXT: v_readlane_b32 s96, v74, 30
+; GFX11-NEXT: v_readlane_b32 s87, v74, 29
+; GFX11-NEXT: v_readlane_b32 s86, v74, 28
+; GFX11-NEXT: v_readlane_b32 s85, v74, 27
+; GFX11-NEXT: v_readlane_b32 s84, v74, 26
+; GFX11-NEXT: v_readlane_b32 s83, v74, 25
+; GFX11-NEXT: v_readlane_b32 s82, v74, 24
+; GFX11-NEXT: v_readlane_b32 s81, v74, 23
+; GFX11-NEXT: v_readlane_b32 s80, v74, 22
+; GFX11-NEXT: v_readlane_b32 s71, v74, 21
+; GFX11-NEXT: v_readlane_b32 s70, v74, 20
+; GFX11-NEXT: v_readlane_b32 s69, v74, 19
+; GFX11-NEXT: v_readlane_b32 s68, v74, 18
+; GFX11-NEXT: v_readlane_b32 s67, v74, 17
+; GFX11-NEXT: v_readlane_b32 s66, v74, 16
+; GFX11-NEXT: v_readlane_b32 s65, v74, 15
+; GFX11-NEXT: v_readlane_b32 s64, v74, 14
+; GFX11-NEXT: v_readlane_b32 s55, v74, 13
+; GFX11-NEXT: v_readlane_b32 s54, v74, 12
+; GFX11-NEXT: v_readlane_b32 s53, v74, 11
+; GFX11-NEXT: v_readlane_b32 s52, v74, 10
+; GFX11-NEXT: v_readlane_b32 s51, v74, 9
+; GFX11-NEXT: v_readlane_b32 s50, v74, 8
+; GFX11-NEXT: v_readlane_b32 s49, v74, 7
+; GFX11-NEXT: v_readlane_b32 s48, v74, 6
+; GFX11-NEXT: v_readlane_b32 s39, v74, 5
+; GFX11-NEXT: v_readlane_b32 s38, v74, 4
+; GFX11-NEXT: v_readlane_b32 s37, v74, 3
+; GFX11-NEXT: v_readlane_b32 s36, v74, 2
+; GFX11-NEXT: v_readlane_b32 s35, v74, 1
+; GFX11-NEXT: v_readlane_b32 s34, v74, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72
@@ -219041,75 +219041,77 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v33, s30, 0
-; SI-NEXT: v_writelane_b32 v33, s31, 1
-; SI-NEXT: v_writelane_b32 v33, s34, 2
-; SI-NEXT: v_writelane_b32 v33, s35, 3
-; SI-NEXT: v_writelane_b32 v33, s36, 4
-; SI-NEXT: v_writelane_b32 v33, s37, 5
-; SI-NEXT: v_writelane_b32 v33, s38, 6
-; SI-NEXT: v_writelane_b32 v33, s39, 7
-; SI-NEXT: v_writelane_b32 v33, s48, 8
-; SI-NEXT: v_writelane_b32 v33, s49, 9
-; SI-NEXT: v_writelane_b32 v33, s50, 10
-; SI-NEXT: v_writelane_b32 v33, s51, 11
-; SI-NEXT: v_writelane_b32 v33, s52, 12
-; SI-NEXT: v_writelane_b32 v33, s53, 13
-; SI-NEXT: v_writelane_b32 v33, s54, 14
-; SI-NEXT: v_writelane_b32 v33, s55, 15
-; SI-NEXT: v_writelane_b32 v33, s64, 16
-; SI-NEXT: v_writelane_b32 v33, s65, 17
+; SI-NEXT: v_writelane_b32 v33, s34, 0
+; SI-NEXT: v_writelane_b32 v33, s35, 1
+; SI-NEXT: v_writelane_b32 v33, s36, 2
+; SI-NEXT: v_writelane_b32 v33, s37, 3
+; SI-NEXT: v_writelane_b32 v33, s38, 4
+; SI-NEXT: v_writelane_b32 v33, s39, 5
+; SI-NEXT: v_writelane_b32 v33, s48, 6
+; SI-NEXT: v_writelane_b32 v33, s49, 7
+; SI-NEXT: v_writelane_b32 v33, s50, 8
+; SI-NEXT: v_writelane_b32 v33, s51, 9
+; SI-NEXT: v_writelane_b32 v33, s52, 10
+; SI-NEXT: v_writelane_b32 v33, s53, 11
+; SI-NEXT: v_writelane_b32 v33, s54, 12
+; SI-NEXT: v_writelane_b32 v33, s55, 13
+; SI-NEXT: v_writelane_b32 v33, s64, 14
+; SI-NEXT: v_writelane_b32 v33, s65, 15
+; SI-NEXT: v_writelane_b32 v33, s66, 16
+; SI-NEXT: v_writelane_b32 v33, s67, 17
+; SI-NEXT: v_writelane_b32 v33, s68, 18
+; SI-NEXT: v_writelane_b32 v33, s69, 19
+; SI-NEXT: v_writelane_b32 v33, s70, 20
+; SI-NEXT: v_writelane_b32 v33, s71, 21
+; SI-NEXT: v_writelane_b32 v33, s80, 22
+; SI-NEXT: v_writelane_b32 v33, s81, 23
+; SI-NEXT: v_writelane_b32 v33, s82, 24
+; SI-NEXT: v_writelane_b32 v33, s83, 25
; SI-NEXT: s_lshr_b32 s6, s16, 16
; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v33, s66, 18
+; SI-NEXT: v_writelane_b32 v33, s84, 26
; SI-NEXT: s_lshr_b32 s7, s17, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v34, s6, 0
-; SI-NEXT: v_writelane_b32 v33, s67, 19
+; SI-NEXT: v_writelane_b32 v33, s85, 27
; SI-NEXT: s_lshr_b32 s56, s18, 16
; SI-NEXT: v_writelane_b32 v34, s7, 1
-; SI-NEXT: v_writelane_b32 v33, s68, 20
+; SI-NEXT: v_writelane_b32 v33, s86, 28
; SI-NEXT: s_lshr_b32 s57, s19, 16
; SI-NEXT: v_writelane_b32 v34, s56, 2
-; SI-NEXT: v_writelane_b32 v33, s69, 21
+; SI-NEXT: v_writelane_b32 v33, s87, 29
; SI-NEXT: s_lshr_b32 s90, s20, 16
; SI-NEXT: v_writelane_b32 v34, s57, 3
-; SI-NEXT: v_writelane_b32 v33, s70, 22
+; SI-NEXT: v_writelane_b32 v33, s96, 30
; SI-NEXT: s_lshr_b32 s91, s21, 16
; SI-NEXT: v_writelane_b32 v34, s90, 4
-; SI-NEXT: v_writelane_b32 v33, s71, 23
+; SI-NEXT: v_writelane_b32 v33, s97, 31
; SI-NEXT: s_lshr_b32 s92, s22, 16
; SI-NEXT: v_writelane_b32 v34, s91, 5
-; SI-NEXT: v_writelane_b32 v33, s80, 24
+; SI-NEXT: v_writelane_b32 v33, s98, 32
; SI-NEXT: s_lshr_b32 s93, s23, 16
; SI-NEXT: v_writelane_b32 v34, s92, 6
-; SI-NEXT: v_writelane_b32 v33, s81, 25
+; SI-NEXT: v_writelane_b32 v33, s99, 33
; SI-NEXT: s_lshr_b32 s94, s24, 16
; SI-NEXT: v_writelane_b32 v34, s93, 7
-; SI-NEXT: v_writelane_b32 v33, s82, 26
+; SI-NEXT: v_writelane_b32 v33, s30, 34
; SI-NEXT: s_lshr_b32 s95, s25, 16
; SI-NEXT: v_writelane_b32 v34, s94, 8
-; SI-NEXT: v_writelane_b32 v33, s83, 27
+; SI-NEXT: v_writelane_b32 v33, s31, 35
; SI-NEXT: s_lshr_b32 s30, s26, 16
; SI-NEXT: v_writelane_b32 v34, s95, 9
-; SI-NEXT: v_writelane_b32 v33, s84, 28
; SI-NEXT: s_lshr_b32 s31, s27, 16
; SI-NEXT: v_writelane_b32 v34, s30, 10
-; SI-NEXT: v_writelane_b32 v33, s85, 29
; SI-NEXT: s_lshr_b32 s34, s28, 16
; SI-NEXT: v_writelane_b32 v34, s31, 11
-; SI-NEXT: v_writelane_b32 v33, s86, 30
; SI-NEXT: s_lshr_b32 s35, s29, 16
; SI-NEXT: v_writelane_b32 v34, s34, 12
-; SI-NEXT: v_writelane_b32 v33, s87, 31
; SI-NEXT: v_readfirstlane_b32 s36, v4
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: v_writelane_b32 v34, s35, 13
-; SI-NEXT: v_writelane_b32 v33, s96, 32
; SI-NEXT: v_readfirstlane_b32 s37, v5
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_writelane_b32 v34, s36, 14
-; SI-NEXT: v_writelane_b32 v33, s97, 33
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17
; SI-NEXT: v_readfirstlane_b32 s47, v17
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16
@@ -219144,7 +219146,6 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT: v_readfirstlane_b32 s5, v0
; SI-NEXT: v_writelane_b32 v34, s37, 15
-; SI-NEXT: v_writelane_b32 v33, s98, 34
; SI-NEXT: v_readfirstlane_b32 s41, v1
; SI-NEXT: v_readfirstlane_b32 s89, v19
; SI-NEXT: v_readfirstlane_b32 s88, v17
@@ -219165,7 +219166,6 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_readfirstlane_b32 s15, v3
; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2
; SI-NEXT: v_writelane_b32 v34, s5, 16
-; SI-NEXT: v_writelane_b32 v33, s99, 35
; SI-NEXT: v_writelane_b32 v34, vcc_lo, 17
; SI-NEXT: s_cbranch_scc0 .LBB107_2
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -219775,43 +219775,43 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_mul_f32_e64 v31, 1.0, s91
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31
; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4
+; SI-NEXT: v_readlane_b32 s30, v33, 34
; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
-; SI-NEXT: v_readlane_b32 s99, v33, 35
-; SI-NEXT: v_readlane_b32 s98, v33, 34
-; SI-NEXT: v_readlane_b32 s97, v33, 33
-; SI-NEXT: v_readlane_b32 s96, v33, 32
-; SI-NEXT: v_readlane_b32 s87, v33, 31
-; SI-NEXT: v_readlane_b32 s86, v33, 30
-; SI-NEXT: v_readlane_b32 s85, v33, 29
-; SI-NEXT: v_readlane_b32 s84, v33, 28
-; SI-NEXT: v_readlane_b32 s83, v33, 27
-; SI-NEXT: v_readlane_b32 s82, v33, 26
-; SI-NEXT: v_readlane_b32 s81, v33, 25
-; SI-NEXT: v_readlane_b32 s80, v33, 24
-; SI-NEXT: v_readlane_b32 s71, v33, 23
-; SI-NEXT: v_readlane_b32 s70, v33, 22
-; SI-NEXT: v_readlane_b32 s69, v33, 21
-; SI-NEXT: v_readlane_b32 s68, v33, 20
-; SI-NEXT: v_readlane_b32 s67, v33, 19
-; SI-NEXT: v_readlane_b32 s66, v33, 18
-; SI-NEXT: v_readlane_b32 s65, v33, 17
-; SI-NEXT: v_readlane_b32 s64, v33, 16
-; SI-NEXT: v_readlane_b32 s55, v33, 15
-; SI-NEXT: v_readlane_b32 s54, v33, 14
-; SI-NEXT: v_readlane_b32 s53, v33, 13
-; SI-NEXT: v_readlane_b32 s52, v33, 12
-; SI-NEXT: v_readlane_b32 s51, v33, 11
-; SI-NEXT: v_readlane_b32 s50, v33, 10
-; SI-NEXT: v_readlane_b32 s49, v33, 9
-; SI-NEXT: v_readlane_b32 s48, v33, 8
-; SI-NEXT: v_readlane_b32 s39, v33, 7
-; SI-NEXT: v_readlane_b32 s38, v33, 6
-; SI-NEXT: v_readlane_b32 s37, v33, 5
-; SI-NEXT: v_readlane_b32 s36, v33, 4
-; SI-NEXT: v_readlane_b32 s35, v33, 3
-; SI-NEXT: v_readlane_b32 s34, v33, 2
-; SI-NEXT: v_readlane_b32 s31, v33, 1
-; SI-NEXT: v_readlane_b32 s30, v33, 0
+; SI-NEXT: v_readlane_b32 s31, v33, 35
+; SI-NEXT: v_readlane_b32 s99, v33, 33
+; SI-NEXT: v_readlane_b32 s98, v33, 32
+; SI-NEXT: v_readlane_b32 s97, v33, 31
+; SI-NEXT: v_readlane_b32 s96, v33, 30
+; SI-NEXT: v_readlane_b32 s87, v33, 29
+; SI-NEXT: v_readlane_b32 s86, v33, 28
+; SI-NEXT: v_readlane_b32 s85, v33, 27
+; SI-NEXT: v_readlane_b32 s84, v33, 26
+; SI-NEXT: v_readlane_b32 s83, v33, 25
+; SI-NEXT: v_readlane_b32 s82, v33, 24
+; SI-NEXT: v_readlane_b32 s81, v33, 23
+; SI-NEXT: v_readlane_b32 s80, v33, 22
+; SI-NEXT: v_readlane_b32 s71, v33, 21
+; SI-NEXT: v_readlane_b32 s70, v33, 20
+; SI-NEXT: v_readlane_b32 s69, v33, 19
+; SI-NEXT: v_readlane_b32 s68, v33, 18
+; SI-NEXT: v_readlane_b32 s67, v33, 17
+; SI-NEXT: v_readlane_b32 s66, v33, 16
+; SI-NEXT: v_readlane_b32 s65, v33, 15
+; SI-NEXT: v_readlane_b32 s64, v33, 14
+; SI-NEXT: v_readlane_b32 s55, v33, 13
+; SI-NEXT: v_readlane_b32 s54, v33, 12
+; SI-NEXT: v_readlane_b32 s53, v33, 11
+; SI-NEXT: v_readlane_b32 s52, v33, 10
+; SI-NEXT: v_readlane_b32 s51, v33, 9
+; SI-NEXT: v_readlane_b32 s50, v33, 8
+; SI-NEXT: v_readlane_b32 s49, v33, 7
+; SI-NEXT: v_readlane_b32 s48, v33, 6
+; SI-NEXT: v_readlane_b32 s39, v33, 5
+; SI-NEXT: v_readlane_b32 s38, v33, 4
+; SI-NEXT: v_readlane_b32 s37, v33, 3
+; SI-NEXT: v_readlane_b32 s36, v33, 2
+; SI-NEXT: v_readlane_b32 s35, v33, 1
+; SI-NEXT: v_readlane_b32 s34, v33, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -219825,12 +219825,12 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v32, s30, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 1
-; VI-NEXT: v_writelane_b32 v32, s34, 2
-; VI-NEXT: v_writelane_b32 v32, s35, 3
-; VI-NEXT: v_writelane_b32 v32, s36, 4
-; VI-NEXT: v_writelane_b32 v32, s37, 5
+; VI-NEXT: v_writelane_b32 v32, s34, 0
+; VI-NEXT: v_writelane_b32 v32, s35, 1
+; VI-NEXT: v_writelane_b32 v32, s36, 2
+; VI-NEXT: v_writelane_b32 v32, s37, 3
+; VI-NEXT: v_writelane_b32 v32, s38, 4
+; VI-NEXT: v_writelane_b32 v32, s39, 5
; VI-NEXT: v_mov_b32_e32 v19, s16
; VI-NEXT: v_readfirstlane_b32 s57, v2
; VI-NEXT: v_mov_b32_e32 v2, s17
@@ -219859,7 +219859,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; VI-NEXT: v_readfirstlane_b32 s22, v14
; VI-NEXT: v_mov_b32_e32 v14, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; VI-NEXT: v_writelane_b32 v32, s38, 6
+; VI-NEXT: v_writelane_b32 v32, s30, 6
; VI-NEXT: v_readfirstlane_b32 s20, v15
; VI-NEXT: v_readfirstlane_b32 s18, v16
; VI-NEXT: v_readfirstlane_b32 s16, v17
@@ -219880,7 +219880,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; VI-NEXT: v_readfirstlane_b32 s6, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s9, v1
-; VI-NEXT: v_writelane_b32 v32, s39, 7
+; VI-NEXT: v_writelane_b32 v32, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB107_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB107_3
@@ -220046,6 +220046,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; VI-NEXT: s_add_i32 s56, s28, 0x30000
; VI-NEXT: s_add_i32 s57, s4, 0x30000
; VI-NEXT: .LBB107_3: ; %end
+; VI-NEXT: v_readlane_b32 s30, v32, 6
; VI-NEXT: v_mov_b32_e32 v0, s27
; VI-NEXT: v_mov_b32_e32 v1, s25
; VI-NEXT: v_mov_b32_e32 v2, s23
@@ -220078,14 +220079,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; VI-NEXT: v_mov_b32_e32 v29, s20
; VI-NEXT: v_mov_b32_e32 v30, s18
; VI-NEXT: v_mov_b32_e32 v31, s16
-; VI-NEXT: v_readlane_b32 s39, v32, 7
-; VI-NEXT: v_readlane_b32 s38, v32, 6
-; VI-NEXT: v_readlane_b32 s37, v32, 5
-; VI-NEXT: v_readlane_b32 s36, v32, 4
-; VI-NEXT: v_readlane_b32 s35, v32, 3
-; VI-NEXT: v_readlane_b32 s34, v32, 2
-; VI-NEXT: v_readlane_b32 s31, v32, 1
-; VI-NEXT: v_readlane_b32 s30, v32, 0
+; VI-NEXT: v_readlane_b32 s31, v32, 7
+; VI-NEXT: v_readlane_b32 s39, v32, 5
+; VI-NEXT: v_readlane_b32 s38, v32, 4
+; VI-NEXT: v_readlane_b32 s37, v32, 3
+; VI-NEXT: v_readlane_b32 s36, v32, 2
+; VI-NEXT: v_readlane_b32 s35, v32, 1
+; VI-NEXT: v_readlane_b32 s34, v32, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -223131,60 +223131,60 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v32, s30, 0
-; SI-NEXT: v_writelane_b32 v32, s31, 1
-; SI-NEXT: v_writelane_b32 v32, s34, 2
-; SI-NEXT: v_writelane_b32 v32, s35, 3
-; SI-NEXT: v_writelane_b32 v32, s36, 4
-; SI-NEXT: v_writelane_b32 v32, s37, 5
-; SI-NEXT: v_writelane_b32 v32, s38, 6
-; SI-NEXT: v_writelane_b32 v32, s39, 7
-; SI-NEXT: v_writelane_b32 v32, s48, 8
-; SI-NEXT: v_writelane_b32 v32, s49, 9
-; SI-NEXT: v_writelane_b32 v32, s50, 10
-; SI-NEXT: v_writelane_b32 v32, s51, 11
-; SI-NEXT: v_writelane_b32 v32, s52, 12
-; SI-NEXT: v_writelane_b32 v32, s53, 13
-; SI-NEXT: v_writelane_b32 v32, s54, 14
-; SI-NEXT: v_writelane_b32 v32, s55, 15
-; SI-NEXT: v_writelane_b32 v32, s64, 16
-; SI-NEXT: v_writelane_b32 v32, s65, 17
-; SI-NEXT: v_writelane_b32 v32, s66, 18
-; SI-NEXT: v_writelane_b32 v32, s67, 19
-; SI-NEXT: v_writelane_b32 v32, s68, 20
-; SI-NEXT: v_writelane_b32 v32, s69, 21
-; SI-NEXT: v_writelane_b32 v32, s70, 22
-; SI-NEXT: v_writelane_b32 v32, s71, 23
-; SI-NEXT: v_writelane_b32 v32, s80, 24
+; SI-NEXT: v_writelane_b32 v32, s34, 0
+; SI-NEXT: v_writelane_b32 v32, s35, 1
+; SI-NEXT: v_writelane_b32 v32, s36, 2
+; SI-NEXT: v_writelane_b32 v32, s37, 3
+; SI-NEXT: v_writelane_b32 v32, s38, 4
+; SI-NEXT: v_writelane_b32 v32, s39, 5
+; SI-NEXT: v_writelane_b32 v32, s48, 6
+; SI-NEXT: v_writelane_b32 v32, s49, 7
+; SI-NEXT: v_writelane_b32 v32, s50, 8
+; SI-NEXT: v_writelane_b32 v32, s51, 9
+; SI-NEXT: v_writelane_b32 v32, s52, 10
+; SI-NEXT: v_writelane_b32 v32, s53, 11
+; SI-NEXT: v_writelane_b32 v32, s54, 12
+; SI-NEXT: v_writelane_b32 v32, s55, 13
+; SI-NEXT: v_writelane_b32 v32, s64, 14
+; SI-NEXT: v_writelane_b32 v32, s65, 15
+; SI-NEXT: v_writelane_b32 v32, s66, 16
+; SI-NEXT: v_writelane_b32 v32, s67, 17
+; SI-NEXT: v_writelane_b32 v32, s68, 18
+; SI-NEXT: v_writelane_b32 v32, s69, 19
+; SI-NEXT: v_writelane_b32 v32, s70, 20
+; SI-NEXT: v_writelane_b32 v32, s71, 21
+; SI-NEXT: v_writelane_b32 v32, s80, 22
+; SI-NEXT: v_writelane_b32 v32, s81, 23
+; SI-NEXT: v_writelane_b32 v32, s82, 24
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v32, s81, 25
+; SI-NEXT: v_writelane_b32 v32, s83, 25
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v33, s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v7
-; SI-NEXT: v_writelane_b32 v32, s82, 26
+; SI-NEXT: v_writelane_b32 v32, s84, 26
; SI-NEXT: v_writelane_b32 v33, s4, 1
; SI-NEXT: v_readfirstlane_b32 s4, v5
-; SI-NEXT: v_writelane_b32 v32, s83, 27
+; SI-NEXT: v_writelane_b32 v32, s85, 27
; SI-NEXT: v_writelane_b32 v33, s4, 2
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v32, s84, 28
+; SI-NEXT: v_writelane_b32 v32, s86, 28
; SI-NEXT: v_writelane_b32 v33, s4, 3
-; SI-NEXT: v_writelane_b32 v32, s85, 29
+; SI-NEXT: v_writelane_b32 v32, s87, 29
; SI-NEXT: v_writelane_b32 v33, s29, 4
; SI-NEXT: s_lshr_b32 s4, s28, 16
-; SI-NEXT: v_writelane_b32 v32, s86, 30
+; SI-NEXT: v_writelane_b32 v32, s96, 30
; SI-NEXT: v_writelane_b32 v33, s4, 5
-; SI-NEXT: v_writelane_b32 v32, s87, 31
+; SI-NEXT: v_writelane_b32 v32, s97, 31
; SI-NEXT: v_writelane_b32 v33, s27, 6
; SI-NEXT: s_lshr_b32 s4, s26, 16
-; SI-NEXT: v_writelane_b32 v32, s96, 32
+; SI-NEXT: v_writelane_b32 v32, s98, 32
; SI-NEXT: v_writelane_b32 v33, s4, 7
-; SI-NEXT: v_writelane_b32 v32, s97, 33
+; SI-NEXT: v_writelane_b32 v32, s99, 33
; SI-NEXT: v_writelane_b32 v33, s25, 8
-; SI-NEXT: v_writelane_b32 v32, s98, 34
+; SI-NEXT: v_writelane_b32 v32, s30, 34
; SI-NEXT: v_writelane_b32 v33, s23, 9
-; SI-NEXT: v_writelane_b32 v32, s99, 35
+; SI-NEXT: v_writelane_b32 v32, s31, 35
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: s_lshr_b32 s48, s29, 16
; SI-NEXT: s_lshr_b32 s76, s27, 16
@@ -223811,6 +223811,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_lshl_b32 s46, s96, 16
; SI-NEXT: s_or_b32 s5, s5, s46
+; SI-NEXT: v_readlane_b32 s30, v32, 34
; SI-NEXT: v_readlane_b32 s57, v33, 23
; SI-NEXT: v_readlane_b32 s47, v33, 33
; SI-NEXT: v_mov_b32_e32 v0, s16
@@ -223845,42 +223846,41 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v29, s7
; SI-NEXT: v_mov_b32_e32 v30, s4
; SI-NEXT: v_mov_b32_e32 v31, s5
-; SI-NEXT: v_readlane_b32 s99, v32, 35
-; SI-NEXT: v_readlane_b32 s98, v32, 34
-; SI-NEXT: v_readlane_b32 s97, v32, 33
-; SI-NEXT: v_readlane_b32 s96, v32, 32
-; SI-NEXT: v_readlane_b32 s87, v32, 31
-; SI-NEXT: v_readlane_b32 s86, v32, 30
-; SI-NEXT: v_readlane_b32 s85, v32, 29
-; SI-NEXT: v_readlane_b32 s84, v32, 28
-; SI-NEXT: v_readlane_b32 s83, v32, 27
-; SI-NEXT: v_readlane_b32 s82, v32, 26
-; SI-NEXT: v_readlane_b32 s81, v32, 25
-; SI-NEXT: v_readlane_b32 s80, v32, 24
-; SI-NEXT: v_readlane_b32 s71, v32, 23
-; SI-NEXT: v_readlane_b32 s70, v32, 22
-; SI-NEXT: v_readlane_b32 s69, v32, 21
-; SI-NEXT: v_readlane_b32 s68, v32, 20
-; SI-NEXT: v_readlane_b32 s67, v32, 19
-; SI-NEXT: v_readlane_b32 s66, v32, 18
-; SI-NEXT: v_readlane_b32 s65, v32, 17
-; SI-NEXT: v_readlane_b32 s64, v32, 16
-; SI-NEXT: v_readlane_b32 s55, v32, 15
-; SI-NEXT: v_readlane_b32 s54, v32, 14
-; SI-NEXT: v_readlane_b32 s53, v32, 13
-; SI-NEXT: v_readlane_b32 s52, v32, 12
-; SI-NEXT: v_readlane_b32 s51, v32, 11
-; SI-NEXT: v_readlane_b32 s50, v32, 10
-; SI-NEXT: v_readlane_b32 s49, v32, 9
-; SI-NEXT: v_readlane_b32 s48, v32, 8
-; SI-NEXT: v_readlane_b32 s39, v32, 7
-; SI-NEXT: v_readlane_b32 s38, v32, 6
-; SI-NEXT: v_readlane_b32 s37, v32, 5
-; SI-NEXT: v_readlane_b32 s36, v32, 4
-; SI-NEXT: v_readlane_b32 s35, v32, 3
-; SI-NEXT: v_readlane_b32 s34, v32, 2
-; SI-NEXT: v_readlane_b32 s31, v32, 1
-; SI-NEXT: v_readlane_b32 s30, v32, 0
+; SI-NEXT: v_readlane_b32 s31, v32, 35
+; SI-NEXT: v_readlane_b32 s99, v32, 33
+; SI-NEXT: v_readlane_b32 s98, v32, 32
+; SI-NEXT: v_readlane_b32 s97, v32, 31
+; SI-NEXT: v_readlane_b32 s96, v32, 30
+; SI-NEXT: v_readlane_b32 s87, v32, 29
+; SI-NEXT: v_readlane_b32 s86, v32, 28
+; SI-NEXT: v_readlane_b32 s85, v32, 27
+; SI-NEXT: v_readlane_b32 s84, v32, 26
+; SI-NEXT: v_readlane_b32 s83, v32, 25
+; SI-NEXT: v_readlane_b32 s82, v32, 24
+; SI-NEXT: v_readlane_b32 s81, v32, 23
+; SI-NEXT: v_readlane_b32 s80, v32, 22
+; SI-NEXT: v_readlane_b32 s71, v32, 21
+; SI-NEXT: v_readlane_b32 s70, v32, 20
+; SI-NEXT: v_readlane_b32 s69, v32, 19
+; SI-NEXT: v_readlane_b32 s68, v32, 18
+; SI-NEXT: v_readlane_b32 s67, v32, 17
+; SI-NEXT: v_readlane_b32 s66, v32, 16
+; SI-NEXT: v_readlane_b32 s65, v32, 15
+; SI-NEXT: v_readlane_b32 s64, v32, 14
+; SI-NEXT: v_readlane_b32 s55, v32, 13
+; SI-NEXT: v_readlane_b32 s54, v32, 12
+; SI-NEXT: v_readlane_b32 s53, v32, 11
+; SI-NEXT: v_readlane_b32 s52, v32, 10
+; SI-NEXT: v_readlane_b32 s51, v32, 9
+; SI-NEXT: v_readlane_b32 s50, v32, 8
+; SI-NEXT: v_readlane_b32 s49, v32, 7
+; SI-NEXT: v_readlane_b32 s48, v32, 6
+; SI-NEXT: v_readlane_b32 s39, v32, 5
+; SI-NEXT: v_readlane_b32 s38, v32, 4
+; SI-NEXT: v_readlane_b32 s37, v32, 3
+; SI-NEXT: v_readlane_b32 s36, v32, 2
+; SI-NEXT: v_readlane_b32 s35, v32, 1
+; SI-NEXT: v_readlane_b32 s34, v32, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -223940,12 +223940,12 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v32, s30, 0
-; VI-NEXT: v_writelane_b32 v32, s31, 1
-; VI-NEXT: v_writelane_b32 v32, s34, 2
-; VI-NEXT: v_writelane_b32 v32, s35, 3
-; VI-NEXT: v_writelane_b32 v32, s36, 4
-; VI-NEXT: v_writelane_b32 v32, s37, 5
+; VI-NEXT: v_writelane_b32 v32, s34, 0
+; VI-NEXT: v_writelane_b32 v32, s35, 1
+; VI-NEXT: v_writelane_b32 v32, s36, 2
+; VI-NEXT: v_writelane_b32 v32, s37, 3
+; VI-NEXT: v_writelane_b32 v32, s38, 4
+; VI-NEXT: v_writelane_b32 v32, s39, 5
; VI-NEXT: v_mov_b32_e32 v19, s16
; VI-NEXT: v_readfirstlane_b32 s57, v2
; VI-NEXT: v_mov_b32_e32 v2, s17
@@ -223974,7 +223974,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s22, v14
; VI-NEXT: v_mov_b32_e32 v14, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; VI-NEXT: v_writelane_b32 v32, s38, 6
+; VI-NEXT: v_writelane_b32 v32, s30, 6
; VI-NEXT: v_readfirstlane_b32 s20, v15
; VI-NEXT: v_readfirstlane_b32 s18, v16
; VI-NEXT: v_readfirstlane_b32 s16, v17
@@ -223995,7 +223995,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s6, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s9, v1
-; VI-NEXT: v_writelane_b32 v32, s39, 7
+; VI-NEXT: v_writelane_b32 v32, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB111_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB111_3
@@ -224161,6 +224161,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; VI-NEXT: s_add_i32 s56, s28, 0x30000
; VI-NEXT: s_add_i32 s57, s4, 0x30000
; VI-NEXT: .LBB111_3: ; %end
+; VI-NEXT: v_readlane_b32 s30, v32, 6
; VI-NEXT: v_mov_b32_e32 v0, s27
; VI-NEXT: v_mov_b32_e32 v1, s25
; VI-NEXT: v_mov_b32_e32 v2, s23
@@ -224193,14 +224194,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v29, s20
; VI-NEXT: v_mov_b32_e32 v30, s18
; VI-NEXT: v_mov_b32_e32 v31, s16
-; VI-NEXT: v_readlane_b32 s39, v32, 7
-; VI-NEXT: v_readlane_b32 s38, v32, 6
-; VI-NEXT: v_readlane_b32 s37, v32, 5
-; VI-NEXT: v_readlane_b32 s36, v32, 4
-; VI-NEXT: v_readlane_b32 s35, v32, 3
-; VI-NEXT: v_readlane_b32 s34, v32, 2
-; VI-NEXT: v_readlane_b32 s31, v32, 1
-; VI-NEXT: v_readlane_b32 s30, v32, 0
+; VI-NEXT: v_readlane_b32 s31, v32, 7
+; VI-NEXT: v_readlane_b32 s39, v32, 5
+; VI-NEXT: v_readlane_b32 s38, v32, 4
+; VI-NEXT: v_readlane_b32 s37, v32, 3
+; VI-NEXT: v_readlane_b32 s36, v32, 2
+; VI-NEXT: v_readlane_b32 s35, v32, 1
+; VI-NEXT: v_readlane_b32 s34, v32, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 96be2a0deb9b4..1d4f749fd1890 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) {
+define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -88,7 +88,7 @@ end:
ret <4 x float> %phi
}
-define inreg <4 x float> @bitcast_v4i32_to_v4f32_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v4i32_to_v4f32_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v4f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -190,7 +190,7 @@ end:
ret <4 x float> %phi
}
-define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) {
+define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -270,7 +270,7 @@ end:
ret <4 x i32> %phi
}
-define inreg <4 x i32> @bitcast_v4f32_to_v4i32_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v4f32_to_v4i32_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v4i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,7 +375,7 @@ end:
ret <4 x i32> %phi
}
-define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) {
+define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -457,7 +457,7 @@ end:
ret <2 x i64> %phi
}
-define inreg <2 x i64> @bitcast_v4i32_to_v2i64_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v4i32_to_v2i64_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v2i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -559,7 +559,7 @@ end:
ret <2 x i64> %phi
}
-define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) {
+define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -642,7 +642,7 @@ end:
ret <4 x i32> %phi
}
-define inreg <4 x i32> @bitcast_v2i64_to_v4i32_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v2i64_to_v4i32_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v4i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -744,7 +744,7 @@ end:
ret <4 x i32> %phi
}
-define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) {
+define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,7 +826,7 @@ end:
ret <2 x double> %phi
}
-define inreg <2 x double> @bitcast_v4i32_to_v2f64_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v4i32_to_v2f64_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v2f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -928,7 +928,7 @@ end:
ret <2 x double> %phi
}
-define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) {
+define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1005,7 +1005,7 @@ end:
ret <4 x i32> %phi
}
-define inreg <4 x i32> @bitcast_v2f64_to_v4i32_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v2f64_to_v4i32_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v4i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1102,7 +1102,7 @@ end:
ret <4 x i32> %phi
}
-define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) {
+define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1210,7 +1210,7 @@ end:
ret <8 x i16> %phi
}
-define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v8i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1336,7 +1336,7 @@ end:
ret <8 x i16> %phi
}
-define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) {
+define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1477,7 +1477,7 @@ end:
ret <4 x i32> %phi
}
-define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v4i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1629,7 +1629,7 @@ end:
ret <4 x i32> %phi
}
-define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) {
+define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1737,7 +1737,7 @@ end:
ret <8 x half> %phi
}
-define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v8f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1863,7 +1863,7 @@ end:
ret <8 x half> %phi
}
-define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) {
+define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2021,7 +2021,7 @@ end:
ret <4 x i32> %phi
}
-define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v4i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2189,7 +2189,7 @@ end:
ret <4 x i32> %phi
}
-define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v8bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2318,7 +2318,7 @@ end:
ret <8 x bfloat> %phi
}
-define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v8bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2456,7 +2456,7 @@ end:
ret <8 x bfloat> %phi
}
-define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
+define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2880,7 +2880,7 @@ end:
ret <4 x i32> %phi
}
-define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v4i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3349,7 +3349,7 @@ end:
ret <4 x i32> %phi
}
-define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) {
+define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3675,7 +3675,7 @@ end:
ret <16 x i8> %phi
}
-define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i32_to_v16i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3963,7 +3963,7 @@ end:
ret <16 x i8> %phi
}
-define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
+define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4488,7 +4488,7 @@ end:
ret <4 x i32> %phi
}
-define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v4i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4851,7 +4851,7 @@ end:
ret <4 x i32> %phi
}
-define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) {
+define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4931,7 +4931,7 @@ end:
ret <2 x i64> %phi
}
-define inreg <2 x i64> @bitcast_v4f32_to_v2i64_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v4f32_to_v2i64_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v2i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5036,7 +5036,7 @@ end:
ret <2 x i64> %phi
}
-define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) {
+define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5119,7 +5119,7 @@ end:
ret <4 x float> %phi
}
-define inreg <4 x float> @bitcast_v2i64_to_v4f32_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v2i64_to_v4f32_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v4f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5221,7 +5221,7 @@ end:
ret <4 x float> %phi
}
-define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) {
+define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5301,7 +5301,7 @@ end:
ret <2 x double> %phi
}
-define inreg <2 x double> @bitcast_v4f32_to_v2f64_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v4f32_to_v2f64_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v2f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5406,7 +5406,7 @@ end:
ret <2 x double> %phi
}
-define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) {
+define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5483,7 +5483,7 @@ end:
ret <4 x float> %phi
}
-define inreg <4 x float> @bitcast_v2f64_to_v4f32_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v2f64_to_v4f32_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v4f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5580,7 +5580,7 @@ end:
ret <4 x float> %phi
}
-define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) {
+define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5686,7 +5686,7 @@ end:
ret <8 x i16> %phi
}
-define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v8i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5820,7 +5820,7 @@ end:
ret <8 x i16> %phi
}
-define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) {
+define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5961,7 +5961,7 @@ end:
ret <4 x float> %phi
}
-define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v4f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6113,7 +6113,7 @@ end:
ret <4 x float> %phi
}
-define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) {
+define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6219,7 +6219,7 @@ end:
ret <8 x half> %phi
}
-define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v8f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6353,7 +6353,7 @@ end:
ret <8 x half> %phi
}
-define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) {
+define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6511,7 +6511,7 @@ end:
ret <4 x float> %phi
}
-define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v4f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6679,7 +6679,7 @@ end:
ret <4 x float> %phi
}
-define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v8bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6806,7 +6806,7 @@ end:
ret <8 x bfloat> %phi
}
-define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v8bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6956,7 +6956,7 @@ end:
ret <8 x bfloat> %phi
}
-define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
+define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7380,7 +7380,7 @@ end:
ret <4 x float> %phi
}
-define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v4f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7849,7 +7849,7 @@ end:
ret <4 x float> %phi
}
-define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) {
+define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8171,7 +8171,7 @@ end:
ret <16 x i8> %phi
}
-define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f32_to_v16i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8488,7 +8488,7 @@ end:
ret <16 x i8> %phi
}
-define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
+define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9013,7 +9013,7 @@ end:
ret <4 x float> %phi
}
-define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v4f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9376,7 +9376,7 @@ end:
ret <4 x float> %phi
}
-define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) {
+define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9459,7 +9459,7 @@ end:
ret <2 x double> %phi
}
-define inreg <2 x double> @bitcast_v2i64_to_v2f64_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v2i64_to_v2f64_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v2f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9560,7 +9560,7 @@ end:
ret <2 x double> %phi
}
-define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) {
+define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9637,7 +9637,7 @@ end:
ret <2 x i64> %phi
}
-define inreg <2 x i64> @bitcast_v2f64_to_v2i64_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v2f64_to_v2i64_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v2i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9734,7 +9734,7 @@ end:
ret <2 x i64> %phi
}
-define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) {
+define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9843,7 +9843,7 @@ end:
ret <8 x i16> %phi
}
-define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v8i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9969,7 +9969,7 @@ end:
ret <8 x i16> %phi
}
-define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) {
+define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10110,7 +10110,7 @@ end:
ret <2 x i64> %phi
}
-define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v2i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10262,7 +10262,7 @@ end:
ret <2 x i64> %phi
}
-define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) {
+define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10371,7 +10371,7 @@ end:
ret <8 x half> %phi
}
-define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v8f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10497,7 +10497,7 @@ end:
ret <8 x half> %phi
}
-define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) {
+define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10655,7 +10655,7 @@ end:
ret <2 x i64> %phi
}
-define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v2i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10823,7 +10823,7 @@ end:
ret <2 x i64> %phi
}
-define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v8bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10953,7 +10953,7 @@ end:
ret <8 x bfloat> %phi
}
-define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v8bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11091,7 +11091,7 @@ end:
ret <8 x bfloat> %phi
}
-define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
+define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11515,7 +11515,7 @@ end:
ret <2 x i64> %phi
}
-define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v2i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11984,7 +11984,7 @@ end:
ret <2 x i64> %phi
}
-define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) {
+define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12310,7 +12310,7 @@ end:
ret <16 x i8> %phi
}
-define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i64_to_v16i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12598,7 +12598,7 @@ end:
ret <16 x i8> %phi
}
-define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
+define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13123,7 +13123,7 @@ end:
ret <2 x i64> %phi
}
-define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v2i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13486,7 +13486,7 @@ end:
ret <2 x i64> %phi
}
-define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) {
+define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13589,7 +13589,7 @@ end:
ret <8 x i16> %phi
}
-define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v8i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13715,7 +13715,7 @@ end:
ret <8 x i16> %phi
}
-define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) {
+define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13856,7 +13856,7 @@ end:
ret <2 x double> %phi
}
-define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v2f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14008,7 +14008,7 @@ end:
ret <2 x double> %phi
}
-define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) {
+define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14111,7 +14111,7 @@ end:
ret <8 x half> %phi
}
-define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v8f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14237,7 +14237,7 @@ end:
ret <8 x half> %phi
}
-define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) {
+define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14395,7 +14395,7 @@ end:
ret <2 x double> %phi
}
-define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v2f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14563,7 +14563,7 @@ end:
ret <2 x double> %phi
}
-define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v8bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14685,7 +14685,7 @@ end:
ret <8 x bfloat> %phi
}
-define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v8bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14827,7 +14827,7 @@ end:
ret <8 x bfloat> %phi
}
-define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
+define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15251,7 +15251,7 @@ end:
ret <2 x double> %phi
}
-define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v2f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15720,7 +15720,7 @@ end:
ret <2 x double> %phi
}
-define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) {
+define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16038,7 +16038,7 @@ end:
ret <16 x i8> %phi
}
-define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f64_to_v16i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16346,7 +16346,7 @@ end:
ret <16 x i8> %phi
}
-define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
+define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16871,7 +16871,7 @@ end:
ret <2 x double> %phi
}
-define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v2f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17234,7 +17234,7 @@ end:
ret <2 x double> %phi
}
-define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) {
+define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17391,7 +17391,7 @@ end:
ret <8 x half> %phi
}
-define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v8f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17566,7 +17566,7 @@ end:
ret <8 x half> %phi
}
-define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) {
+define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17706,7 +17706,7 @@ end:
ret <8 x i16> %phi
}
-define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v8i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17880,7 +17880,7 @@ end:
ret <8 x i16> %phi
}
-define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v8bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18032,7 +18032,7 @@ end:
ret <8 x bfloat> %phi
}
-define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v8bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18207,7 +18207,7 @@ end:
ret <8 x bfloat> %phi
}
-define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
+define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18641,7 +18641,7 @@ end:
ret <8 x i16> %phi
}
-define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v8i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19114,7 +19114,7 @@ end:
ret <8 x i16> %phi
}
-define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) {
+define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19494,7 +19494,7 @@ end:
ret <16 x i8> %phi
}
-define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i16_to_v16i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19843,7 +19843,7 @@ end:
ret <16 x i8> %phi
}
-define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
+define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20389,7 +20389,7 @@ end:
ret <8 x i16> %phi
}
-define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v8i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20779,7 +20779,7 @@ end:
ret <8 x i16> %phi
}
-define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v8bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20947,7 +20947,7 @@ end:
ret <8 x bfloat> %phi
}
-define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v8bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21139,7 +21139,7 @@ end:
ret <8 x bfloat> %phi
}
-define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) {
+define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21586,7 +21586,7 @@ end:
ret <8 x half> %phi
}
-define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v8f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22084,7 +22084,7 @@ end:
ret <8 x half> %phi
}
-define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) {
+define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22468,7 +22468,7 @@ end:
ret <16 x i8> %phi
}
-define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f16_to_v16i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22837,7 +22837,7 @@ end:
ret <16 x i8> %phi
}
-define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
+define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23383,7 +23383,7 @@ end:
ret <8 x half> %phi
}
-define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v8f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23773,7 +23773,7 @@ end:
ret <8 x half> %phi
}
-define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
+define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24432,7 +24432,7 @@ end:
ret <16 x i8> %phi
}
-define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8bf16_to_v16i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25138,7 +25138,7 @@ end:
ret <16 x i8> %phi
}
-define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
+define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v8bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25686,7 +25686,7 @@ end:
ret <8 x bfloat> %phi
}
-define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i8_to_v8bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26071,3 +26071,5 @@ end:
%phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <8 x bfloat> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
index 430a93d9e9bf0..c09389ef700ac 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) {
+define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i32_to_v5f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -92,7 +92,7 @@ end:
ret <5 x float> %phi
}
-define inreg <5 x float> @bitcast_v5i32_to_v5f32_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v5i32_to_v5f32_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i32_to_v5f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,7 +202,7 @@ end:
ret <5 x float> %phi
}
-define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) {
+define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f32_to_v5i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -286,7 +286,7 @@ end:
ret <5 x i32> %phi
}
-define inreg <5 x i32> @bitcast_v5f32_to_v5i32_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v5f32_to_v5i32_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f32_to_v5i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -400,7 +400,7 @@ end:
ret <5 x i32> %phi
}
-define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) {
+define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i32_to_v10i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -519,7 +519,7 @@ end:
ret <10 x i16> %phi
}
-define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i32_to_v10i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -659,7 +659,7 @@ end:
ret <10 x i16> %phi
}
-define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) {
+define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i16_to_v5i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -817,7 +817,7 @@ end:
ret <5 x i32> %phi
}
-define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i16_to_v5i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -990,7 +990,7 @@ end:
ret <5 x i32> %phi
}
-define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) {
+define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i32_to_v10f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1109,7 +1109,7 @@ end:
ret <10 x half> %phi
}
-define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i32_to_v10f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1249,7 +1249,7 @@ end:
ret <10 x half> %phi
}
-define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) {
+define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f16_to_v5i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1427,7 +1427,7 @@ end:
ret <5 x i32> %phi
}
-define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f16_to_v5i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1619,7 +1619,7 @@ end:
ret <5 x i32> %phi
}
-define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) {
+define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f32_to_v10i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1736,7 +1736,7 @@ end:
ret <10 x i16> %phi
}
-define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f32_to_v10i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1893,7 +1893,7 @@ end:
ret <10 x i16> %phi
}
-define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) {
+define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i16_to_v5f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2051,7 +2051,7 @@ end:
ret <5 x float> %phi
}
-define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i16_to_v5f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2224,7 +2224,7 @@ end:
ret <5 x float> %phi
}
-define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) {
+define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f32_to_v10f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2341,7 +2341,7 @@ end:
ret <10 x half> %phi
}
-define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f32_to_v10f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2498,7 +2498,7 @@ end:
ret <10 x half> %phi
}
-define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) {
+define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f16_to_v5f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2676,7 +2676,7 @@ end:
ret <5 x float> %phi
}
-define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f16_to_v5f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2868,7 +2868,7 @@ end:
ret <5 x float> %phi
}
-define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) {
+define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i16_to_v10f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3045,7 +3045,7 @@ end:
ret <10 x half> %phi
}
-define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i16_to_v10f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3250,7 +3250,7 @@ end:
ret <10 x half> %phi
}
-define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) {
+define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f16_to_v10i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3407,7 +3407,7 @@ end:
ret <10 x i16> %phi
}
-define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f16_to_v10i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3612,6 +3612,9 @@ end:
%phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <10 x i16> %phi
}
+
+attributes #0 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-FAKE16: {{.*}}
; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
index b6b321a08f7aa..73d5bfa812b08 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define half @bitcast_i16_to_f16(i16 %a, i32 %b) {
+define half @bitcast_i16_to_f16(i16 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i16_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,7 +99,7 @@ end:
ret half %phi
}
-define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) {
+define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i16_to_f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -181,7 +181,7 @@ end:
ret half %phi
}
-define i16 @bitcast_f16_to_i16(half %a, i32 %b) {
+define i16 @bitcast_f16_to_i16(half %a, i32 %b) #0 {
; SI-LABEL: bitcast_f16_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -287,7 +287,7 @@ end:
ret i16 %phi
}
-define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) {
+define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f16_to_i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -394,7 +394,7 @@ end:
ret i16 %phi
}
-define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) {
+define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i16_to_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -490,7 +490,7 @@ end:
ret bfloat %phi
}
-define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) {
+define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i16_to_bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -576,7 +576,7 @@ end:
ret bfloat %phi
}
-define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) {
+define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) #0 {
; SI-LABEL: bitcast_bf16_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -720,7 +720,7 @@ end:
ret i16 %phi
}
-define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) {
+define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_bf16_to_i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -836,7 +836,7 @@ end:
ret i16 %phi
}
-define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) {
+define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) #0 {
; SI-LABEL: bitcast_f16_to_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -939,7 +939,7 @@ end:
ret bfloat %phi
}
-define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) {
+define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f16_to_bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1050,7 +1050,7 @@ end:
ret bfloat %phi
}
-define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) {
+define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) #0 {
; SI-LABEL: bitcast_bf16_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1194,7 +1194,7 @@ end:
ret half %phi
}
-define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) {
+define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_bf16_to_f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1309,3 +1309,5 @@ end:
%phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret half %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
index d463b115d1088..647b212d4d0bf 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) {
+define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v6f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -96,7 +96,7 @@ end:
ret <6 x float> %phi
}
-define inreg <6 x float> @bitcast_v6i32_to_v6f32_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v6i32_to_v6f32_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v6f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -213,7 +213,7 @@ end:
ret <6 x float> %phi
}
-define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) {
+define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v6i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -300,7 +300,7 @@ end:
ret <6 x i32> %phi
}
-define inreg <6 x i32> @bitcast_v6f32_to_v6i32_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v6f32_to_v6i32_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v6i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -422,7 +422,7 @@ end:
ret <6 x i32> %phi
}
-define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) {
+define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -512,7 +512,7 @@ end:
ret <3 x i64> %phi
}
-define inreg <3 x i64> @bitcast_v6i32_to_v3i64_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v6i32_to_v3i64_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v3i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -629,7 +629,7 @@ end:
ret <3 x i64> %phi
}
-define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) {
+define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v6i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -721,7 +721,7 @@ end:
ret <6 x i32> %phi
}
-define inreg <6 x i32> @bitcast_v3i64_to_v6i32_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v3i64_to_v6i32_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v6i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -838,7 +838,7 @@ end:
ret <6 x i32> %phi
}
-define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) {
+define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v3f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -928,7 +928,7 @@ end:
ret <3 x double> %phi
}
-define inreg <3 x double> @bitcast_v6i32_to_v3f64_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v6i32_to_v3f64_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v3f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1045,7 +1045,7 @@ end:
ret <3 x double> %phi
}
-define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) {
+define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v6i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1126,7 +1126,7 @@ end:
ret <6 x i32> %phi
}
-define inreg <6 x i32> @bitcast_v3f64_to_v6i32_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v3f64_to_v6i32_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v6i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1236,7 +1236,7 @@ end:
ret <6 x i32> %phi
}
-define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) {
+define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v12i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1365,7 +1365,7 @@ end:
ret <12 x i16> %phi
}
-define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v12i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1518,7 +1518,7 @@ end:
ret <12 x i16> %phi
}
-define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) {
+define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v6i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1692,7 +1692,7 @@ end:
ret <6 x i32> %phi
}
-define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v6i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1885,7 +1885,7 @@ end:
ret <6 x i32> %phi
}
-define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) {
+define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v12f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2014,7 +2014,7 @@ end:
ret <12 x half> %phi
}
-define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i32_to_v12f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2167,7 +2167,7 @@ end:
ret <12 x half> %phi
}
-define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) {
+define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v6i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2365,7 +2365,7 @@ end:
ret <6 x i32> %phi
}
-define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v6i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2580,7 +2580,7 @@ end:
ret <6 x i32> %phi
}
-define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) {
+define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2667,7 +2667,7 @@ end:
ret <3 x i64> %phi
}
-define inreg <3 x i64> @bitcast_v6f32_to_v3i64_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v6f32_to_v3i64_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v3i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2789,7 +2789,7 @@ end:
ret <3 x i64> %phi
}
-define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) {
+define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v6f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2881,7 +2881,7 @@ end:
ret <6 x float> %phi
}
-define inreg <6 x float> @bitcast_v3i64_to_v6f32_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v3i64_to_v6f32_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v6f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2998,7 +2998,7 @@ end:
ret <6 x float> %phi
}
-define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) {
+define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v3f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3085,7 +3085,7 @@ end:
ret <3 x double> %phi
}
-define inreg <3 x double> @bitcast_v6f32_to_v3f64_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v6f32_to_v3f64_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v3f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3207,7 +3207,7 @@ end:
ret <3 x double> %phi
}
-define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) {
+define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v6f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3288,7 +3288,7 @@ end:
ret <6 x float> %phi
}
-define inreg <6 x float> @bitcast_v3f64_to_v6f32_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v3f64_to_v6f32_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v6f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3398,7 +3398,7 @@ end:
ret <6 x float> %phi
}
-define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) {
+define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v12i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3524,7 +3524,7 @@ end:
ret <12 x i16> %phi
}
-define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v12i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3694,7 +3694,7 @@ end:
ret <12 x i16> %phi
}
-define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) {
+define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v6f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3868,7 +3868,7 @@ end:
ret <6 x float> %phi
}
-define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v6f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4061,7 +4061,7 @@ end:
ret <6 x float> %phi
}
-define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) {
+define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v12f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4187,7 +4187,7 @@ end:
ret <12 x half> %phi
}
-define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f32_to_v12f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4357,7 +4357,7 @@ end:
ret <12 x half> %phi
}
-define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) {
+define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v6f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4555,7 +4555,7 @@ end:
ret <6 x float> %phi
}
-define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v6f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4770,7 +4770,7 @@ end:
ret <6 x float> %phi
}
-define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) {
+define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v3f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4862,7 +4862,7 @@ end:
ret <3 x double> %phi
}
-define inreg <3 x double> @bitcast_v3i64_to_v3f64_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v3i64_to_v3f64_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v3f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4978,7 +4978,7 @@ end:
ret <3 x double> %phi
}
-define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) {
+define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5059,7 +5059,7 @@ end:
ret <3 x i64> %phi
}
-define inreg <3 x i64> @bitcast_v3f64_to_v3i64_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v3f64_to_v3i64_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v3i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5169,7 +5169,7 @@ end:
ret <3 x i64> %phi
}
-define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) {
+define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v12i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5300,7 +5300,7 @@ end:
ret <12 x i16> %phi
}
-define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v12i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5453,7 +5453,7 @@ end:
ret <12 x i16> %phi
}
-define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) {
+define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5627,7 +5627,7 @@ end:
ret <3 x i64> %phi
}
-define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v3i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5820,7 +5820,7 @@ end:
ret <3 x i64> %phi
}
-define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) {
+define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v12f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5951,7 +5951,7 @@ end:
ret <12 x half> %phi
}
-define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i64_to_v12f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6104,7 +6104,7 @@ end:
ret <12 x half> %phi
}
-define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) {
+define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6302,7 +6302,7 @@ end:
ret <3 x i64> %phi
}
-define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v3i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6517,7 +6517,7 @@ end:
ret <3 x i64> %phi
}
-define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) {
+define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v12i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6637,7 +6637,7 @@ end:
ret <12 x i16> %phi
}
-define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v12i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6795,7 +6795,7 @@ end:
ret <12 x i16> %phi
}
-define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) {
+define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v3f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6969,7 +6969,7 @@ end:
ret <3 x double> %phi
}
-define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v3f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7162,7 +7162,7 @@ end:
ret <3 x double> %phi
}
-define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) {
+define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v12f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7282,7 +7282,7 @@ end:
ret <12 x half> %phi
}
-define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f64_to_v12f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7440,7 +7440,7 @@ end:
ret <12 x half> %phi
}
-define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) {
+define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v3f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7638,7 +7638,7 @@ end:
ret <3 x double> %phi
}
-define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v3f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7853,7 +7853,7 @@ end:
ret <3 x double> %phi
}
-define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) {
+define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v12f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8052,7 +8052,7 @@ end:
ret <12 x half> %phi
}
-define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i16_to_v12f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8283,7 +8283,7 @@ end:
ret <12 x half> %phi
}
-define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) {
+define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v12i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8458,7 +8458,7 @@ end:
ret <12 x i16> %phi
}
-define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f16_to_v12i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8686,6 +8686,9 @@ end:
%phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <12 x i16> %phi
}
+
+attributes #0 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-FAKE16: {{.*}}
; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
index e0fac42ac9d77..c66f5ea8e6c60 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) {
+define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7i32_to_v7f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -101,7 +101,7 @@ end:
ret <7 x float> %phi
}
-define inreg <7 x float> @bitcast_v7i32_to_v7f32_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v7i32_to_v7f32_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7i32_to_v7f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -226,7 +226,7 @@ end:
ret <7 x float> %phi
}
-define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) {
+define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7f32_to_v7i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -317,7 +317,7 @@ end:
ret <7 x i32> %phi
}
-define inreg <7 x i32> @bitcast_v7f32_to_v7i32_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v7f32_to_v7i32_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7f32_to_v7i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -448,7 +448,7 @@ end:
ret <7 x i32> %phi
}
-define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) {
+define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7i32_to_v14i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -588,7 +588,7 @@ end:
ret <14 x i16> %phi
}
-define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7i32_to_v14i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -755,7 +755,7 @@ end:
ret <14 x i16> %phi
}
-define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) {
+define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i16_to_v7i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -946,7 +946,7 @@ end:
ret <7 x i32> %phi
}
-define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i16_to_v7i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1160,7 +1160,7 @@ end:
ret <7 x i32> %phi
}
-define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) {
+define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7i32_to_v14f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1300,7 +1300,7 @@ end:
ret <14 x half> %phi
}
-define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7i32_to_v14f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1467,7 +1467,7 @@ end:
ret <14 x half> %phi
}
-define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) {
+define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f16_to_v7i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1686,7 +1686,7 @@ end:
ret <7 x i32> %phi
}
-define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f16_to_v7i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1925,7 +1925,7 @@ end:
ret <7 x i32> %phi
}
-define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) {
+define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7f32_to_v14i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2061,7 +2061,7 @@ end:
ret <14 x i16> %phi
}
-define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7f32_to_v14i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2244,7 +2244,7 @@ end:
ret <14 x i16> %phi
}
-define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) {
+define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i16_to_v7f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2435,7 +2435,7 @@ end:
ret <7 x float> %phi
}
-define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i16_to_v7f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2649,7 +2649,7 @@ end:
ret <7 x float> %phi
}
-define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) {
+define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7f32_to_v14f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2785,7 +2785,7 @@ end:
ret <14 x half> %phi
}
-define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7f32_to_v14f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2968,7 +2968,7 @@ end:
ret <14 x half> %phi
}
-define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) {
+define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f16_to_v7f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3187,7 +3187,7 @@ end:
ret <7 x float> %phi
}
-define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f16_to_v7f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3426,7 +3426,7 @@ end:
ret <7 x float> %phi
}
-define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) {
+define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i16_to_v14f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3646,7 +3646,7 @@ end:
ret <14 x half> %phi
}
-define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i16_to_v14f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3901,7 +3901,7 @@ end:
ret <14 x half> %phi
}
-define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) {
+define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f16_to_v14i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4094,7 +4094,7 @@ end:
ret <14 x i16> %phi
}
-define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f16_to_v14i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4344,6 +4344,9 @@ end:
%phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <14 x i16> %phi
}
+
+attributes #0 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-FAKE16: {{.*}}
; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 4437990317bd6..417400664c162 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) {
+define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -105,7 +105,7 @@ end:
ret <8 x float> %phi
}
-define inreg <8 x float> @bitcast_v8i32_to_v8f32_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v8i32_to_v8f32_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v8f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -237,7 +237,7 @@ end:
ret <8 x float> %phi
}
-define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) {
+define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -331,7 +331,7 @@ end:
ret <8 x i32> %phi
}
-define inreg <8 x i32> @bitcast_v8f32_to_v8i32_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v8f32_to_v8i32_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v8i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -470,7 +470,7 @@ end:
ret <8 x i32> %phi
}
-define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) {
+define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -569,7 +569,7 @@ end:
ret <4 x i64> %phi
}
-define inreg <4 x i64> @bitcast_v8i32_to_v4i64_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v8i32_to_v4i64_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v4i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -701,7 +701,7 @@ end:
ret <4 x i64> %phi
}
-define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) {
+define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -802,7 +802,7 @@ end:
ret <8 x i32> %phi
}
-define inreg <8 x i32> @bitcast_v4i64_to_v8i32_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v4i64_to_v8i32_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v8i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -934,7 +934,7 @@ end:
ret <8 x i32> %phi
}
-define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) {
+define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1033,7 +1033,7 @@ end:
ret <4 x double> %phi
}
-define inreg <4 x double> @bitcast_v8i32_to_v4f64_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v8i32_to_v4f64_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v4f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1165,7 +1165,7 @@ end:
ret <4 x double> %phi
}
-define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) {
+define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1250,7 +1250,7 @@ end:
ret <8 x i32> %phi
}
-define inreg <8 x i32> @bitcast_v4f64_to_v8i32_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v4f64_to_v8i32_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v8i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1373,7 +1373,7 @@ end:
ret <8 x i32> %phi
}
-define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) {
+define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1523,7 +1523,7 @@ end:
ret <16 x i16> %phi
}
-define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v16i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1703,7 +1703,7 @@ end:
ret <16 x i16> %phi
}
-define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) {
+define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1910,7 +1910,7 @@ end:
ret <8 x i32> %phi
}
-define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v8i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2144,7 +2144,7 @@ end:
ret <8 x i32> %phi
}
-define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) {
+define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2294,7 +2294,7 @@ end:
ret <16 x half> %phi
}
-define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v16f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2474,7 +2474,7 @@ end:
ret <16 x half> %phi
}
-define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) {
+define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2713,7 +2713,7 @@ end:
ret <8 x i32> %phi
}
-define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v8i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2975,7 +2975,7 @@ end:
ret <8 x i32> %phi
}
-define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v16bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3166,7 +3166,7 @@ end:
ret <16 x bfloat> %phi
}
-define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v16bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3370,7 +3370,7 @@ end:
ret <16 x bfloat> %phi
}
-define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
+define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4114,7 +4114,7 @@ end:
ret <8 x i32> %phi
}
-define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v8i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4966,7 +4966,7 @@ end:
ret <8 x i32> %phi
}
-define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) {
+define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v32i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5511,7 +5511,7 @@ end:
ret <32 x i8> %phi
}
-define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i32_to_v32i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6015,7 +6015,7 @@ end:
ret <32 x i8> %phi
}
-define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
+define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6959,7 +6959,7 @@ end:
ret <8 x i32> %phi
}
-define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v8i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7565,7 +7565,7 @@ end:
ret <8 x i32> %phi
}
-define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) {
+define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7659,7 +7659,7 @@ end:
ret <4 x i64> %phi
}
-define inreg <4 x i64> @bitcast_v8f32_to_v4i64_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v8f32_to_v4i64_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v4i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7798,7 +7798,7 @@ end:
ret <4 x i64> %phi
}
-define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) {
+define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7899,7 +7899,7 @@ end:
ret <8 x float> %phi
}
-define inreg <8 x float> @bitcast_v4i64_to_v8f32_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v4i64_to_v8f32_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v8f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8031,7 +8031,7 @@ end:
ret <8 x float> %phi
}
-define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) {
+define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8125,7 +8125,7 @@ end:
ret <4 x double> %phi
}
-define inreg <4 x double> @bitcast_v8f32_to_v4f64_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v8f32_to_v4f64_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v4f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8264,7 +8264,7 @@ end:
ret <4 x double> %phi
}
-define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) {
+define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8349,7 +8349,7 @@ end:
ret <8 x float> %phi
}
-define inreg <8 x float> @bitcast_v4f64_to_v8f32_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v4f64_to_v8f32_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v8f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8472,7 +8472,7 @@ end:
ret <8 x float> %phi
}
-define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) {
+define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8617,7 +8617,7 @@ end:
ret <16 x i16> %phi
}
-define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v16i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8813,7 +8813,7 @@ end:
ret <16 x i16> %phi
}
-define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) {
+define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9020,7 +9020,7 @@ end:
ret <8 x float> %phi
}
-define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v8f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9254,7 +9254,7 @@ end:
ret <8 x float> %phi
}
-define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) {
+define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9399,7 +9399,7 @@ end:
ret <16 x half> %phi
}
-define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v16f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9595,7 +9595,7 @@ end:
ret <16 x half> %phi
}
-define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) {
+define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9834,7 +9834,7 @@ end:
ret <8 x float> %phi
}
-define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v8f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10096,7 +10096,7 @@ end:
ret <8 x float> %phi
}
-define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v16bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10282,7 +10282,7 @@ end:
ret <16 x bfloat> %phi
}
-define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v16bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10510,7 +10510,7 @@ end:
ret <16 x bfloat> %phi
}
-define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
+define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11254,7 +11254,7 @@ end:
ret <8 x float> %phi
}
-define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v8f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12106,7 +12106,7 @@ end:
ret <8 x float> %phi
}
-define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) {
+define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v32i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12647,7 +12647,7 @@ end:
ret <32 x i8> %phi
}
-define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f32_to_v32i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13198,7 +13198,7 @@ end:
ret <32 x i8> %phi
}
-define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
+define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14142,7 +14142,7 @@ end:
ret <8 x float> %phi
}
-define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v8f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14748,7 +14748,7 @@ end:
ret <8 x float> %phi
}
-define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) {
+define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14849,7 +14849,7 @@ end:
ret <4 x double> %phi
}
-define inreg <4 x double> @bitcast_v4i64_to_v4f64_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v4i64_to_v4f64_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v4f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14980,7 +14980,7 @@ end:
ret <4 x double> %phi
}
-define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) {
+define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15065,7 +15065,7 @@ end:
ret <4 x i64> %phi
}
-define inreg <4 x i64> @bitcast_v4f64_to_v4i64_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v4f64_to_v4i64_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v4i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15188,7 +15188,7 @@ end:
ret <4 x i64> %phi
}
-define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) {
+define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15340,7 +15340,7 @@ end:
ret <16 x i16> %phi
}
-define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v16i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15520,7 +15520,7 @@ end:
ret <16 x i16> %phi
}
-define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) {
+define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15727,7 +15727,7 @@ end:
ret <4 x i64> %phi
}
-define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v4i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15961,7 +15961,7 @@ end:
ret <4 x i64> %phi
}
-define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) {
+define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16113,7 +16113,7 @@ end:
ret <16 x half> %phi
}
-define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v16f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16293,7 +16293,7 @@ end:
ret <16 x half> %phi
}
-define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) {
+define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16532,7 +16532,7 @@ end:
ret <4 x i64> %phi
}
-define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v4i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16794,7 +16794,7 @@ end:
ret <4 x i64> %phi
}
-define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v16bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16987,7 +16987,7 @@ end:
ret <16 x bfloat> %phi
}
-define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v16bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17191,7 +17191,7 @@ end:
ret <16 x bfloat> %phi
}
-define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
+define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17935,7 +17935,7 @@ end:
ret <4 x i64> %phi
}
-define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v4i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18787,7 +18787,7 @@ end:
ret <4 x i64> %phi
}
-define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) {
+define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v32i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19338,7 +19338,7 @@ end:
ret <32 x i8> %phi
}
-define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i64_to_v32i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19842,7 +19842,7 @@ end:
ret <32 x i8> %phi
}
-define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
+define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20786,7 +20786,7 @@ end:
ret <4 x i64> %phi
}
-define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v4i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21392,7 +21392,7 @@ end:
ret <4 x i64> %phi
}
-define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) {
+define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21528,7 +21528,7 @@ end:
ret <16 x i16> %phi
}
-define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v16i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21708,7 +21708,7 @@ end:
ret <16 x i16> %phi
}
-define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) {
+define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21915,7 +21915,7 @@ end:
ret <4 x double> %phi
}
-define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v4f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22149,7 +22149,7 @@ end:
ret <4 x double> %phi
}
-define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) {
+define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22285,7 +22285,7 @@ end:
ret <16 x half> %phi
}
-define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v16f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22465,7 +22465,7 @@ end:
ret <16 x half> %phi
}
-define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) {
+define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22704,7 +22704,7 @@ end:
ret <4 x double> %phi
}
-define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v4f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22966,7 +22966,7 @@ end:
ret <4 x double> %phi
}
-define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v16bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23139,7 +23139,7 @@ end:
ret <16 x bfloat> %phi
}
-define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v16bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23351,7 +23351,7 @@ end:
ret <16 x bfloat> %phi
}
-define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
+define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24095,7 +24095,7 @@ end:
ret <4 x double> %phi
}
-define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v4f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24947,7 +24947,7 @@ end:
ret <4 x double> %phi
}
-define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) {
+define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v32i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25482,7 +25482,7 @@ end:
ret <32 x i8> %phi
}
-define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f64_to_v32i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26019,7 +26019,7 @@ end:
ret <32 x i8> %phi
}
-define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
+define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26963,7 +26963,7 @@ end:
ret <4 x double> %phi
}
-define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v4f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27569,7 +27569,7 @@ end:
ret <4 x double> %phi
}
-define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) {
+define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27811,7 +27811,7 @@ end:
ret <16 x half> %phi
}
-define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v16f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28092,7 +28092,7 @@ end:
ret <16 x half> %phi
}
-define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) {
+define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28303,7 +28303,7 @@ end:
ret <16 x i16> %phi
}
-define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v16i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28577,7 +28577,7 @@ end:
ret <16 x i16> %phi
}
-define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v16bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28810,7 +28810,7 @@ end:
ret <16 x bfloat> %phi
}
-define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v16bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29091,7 +29091,7 @@ end:
ret <16 x bfloat> %phi
}
-define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
+define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29872,7 +29872,7 @@ end:
ret <16 x i16> %phi
}
-define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v16i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30714,7 +30714,7 @@ end:
ret <16 x i16> %phi
}
-define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) {
+define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v32i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31362,7 +31362,7 @@ end:
ret <32 x i8> %phi
}
-define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i16_to_v32i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31981,7 +31981,7 @@ end:
ret <32 x i8> %phi
}
-define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
+define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32976,7 +32976,7 @@ end:
ret <16 x i16> %phi
}
-define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v16i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33796,7 +33796,7 @@ end:
ret <16 x i16> %phi
}
-define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v16bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34062,7 +34062,7 @@ end:
ret <16 x bfloat> %phi
}
-define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v16bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34372,7 +34372,7 @@ end:
ret <16 x bfloat> %phi
}
-define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) {
+define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35171,7 +35171,7 @@ end:
ret <16 x half> %phi
}
-define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v16f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36060,7 +36060,7 @@ end:
ret <16 x half> %phi
}
-define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) {
+define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v32i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36715,7 +36715,7 @@ end:
ret <32 x i8> %phi
}
-define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f16_to_v32i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37368,7 +37368,7 @@ end:
ret <32 x i8> %phi
}
-define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
+define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38363,7 +38363,7 @@ end:
ret <16 x half> %phi
}
-define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v16f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39183,7 +39183,7 @@ end:
ret <16 x half> %phi
}
-define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
+define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v32i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40364,7 +40364,7 @@ end:
ret <32 x i8> %phi
}
-define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16bf16_to_v32i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41698,7 +41698,7 @@ end:
ret <32 x i8> %phi
}
-define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
+define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v16bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42697,7 +42697,7 @@ end:
ret <16 x bfloat> %phi
}
-define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i8_to_v16bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43508,3 +43508,5 @@ end:
%phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <16 x bfloat> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
index 6656733d53e51..ead5d76b2e572 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) {
+define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9i32_to_v9f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -109,7 +109,7 @@ end:
ret <9 x float> %phi
}
-define inreg <9 x float> @bitcast_v9i32_to_v9f32_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v9i32_to_v9f32_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9i32_to_v9f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -249,7 +249,7 @@ end:
ret <9 x float> %phi
}
-define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) {
+define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9f32_to_v9i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -347,7 +347,7 @@ end:
ret <9 x i32> %phi
}
-define inreg <9 x i32> @bitcast_v9f32_to_v9i32_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v9f32_to_v9i32_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9f32_to_v9i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,7 +494,7 @@ end:
ret <9 x i32> %phi
}
-define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) {
+define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9i32_to_v18i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -654,7 +654,7 @@ end:
ret <18 x i16> %phi
}
-define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9i32_to_v18i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -848,7 +848,7 @@ end:
ret <18 x i16> %phi
}
-define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) {
+define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18i16_to_v9i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1071,7 +1071,7 @@ end:
ret <9 x i32> %phi
}
-define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18i16_to_v9i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1325,7 +1325,7 @@ end:
ret <9 x i32> %phi
}
-define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) {
+define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9i32_to_v18f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1485,7 +1485,7 @@ end:
ret <18 x half> %phi
}
-define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9i32_to_v18f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1679,7 +1679,7 @@ end:
ret <18 x half> %phi
}
-define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) {
+define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18f16_to_v9i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1939,7 +1939,7 @@ end:
ret <9 x i32> %phi
}
-define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18f16_to_v9i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2224,7 +2224,7 @@ end:
ret <9 x i32> %phi
}
-define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) {
+define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9f32_to_v18i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2379,7 +2379,7 @@ end:
ret <18 x i16> %phi
}
-define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9f32_to_v18i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2607,7 +2607,7 @@ end:
ret <18 x i16> %phi
}
-define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) {
+define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18i16_to_v9f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2830,7 +2830,7 @@ end:
ret <9 x float> %phi
}
-define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18i16_to_v9f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3084,7 +3084,7 @@ end:
ret <9 x float> %phi
}
-define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) {
+define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9f32_to_v18f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3239,7 +3239,7 @@ end:
ret <18 x half> %phi
}
-define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9f32_to_v18f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3467,7 +3467,7 @@ end:
ret <18 x half> %phi
}
-define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) {
+define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18f16_to_v9f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3727,7 +3727,7 @@ end:
ret <9 x float> %phi
}
-define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18f16_to_v9f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4012,7 +4012,7 @@ end:
ret <9 x float> %phi
}
-define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) {
+define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18i16_to_v18f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4274,7 +4274,7 @@ end:
ret <18 x half> %phi
}
-define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18i16_to_v18f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4590,7 +4590,7 @@ end:
ret <18 x half> %phi
}
-define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) {
+define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18f16_to_v18i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4819,7 +4819,7 @@ end:
ret <18 x i16> %phi
}
-define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18f16_to_v18i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5133,6 +5133,9 @@ end:
%phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <18 x i16> %phi
}
+
+attributes #0 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-FAKE16: {{.*}}
; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index a37808917900b..48b7257f5cb9f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) {
+define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v10f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -116,7 +116,7 @@ end:
ret <10 x float> %phi
}
-define inreg <10 x float> @bitcast_v10i32_to_v10f32_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v10i32_to_v10f32_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v10f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -263,7 +263,7 @@ end:
ret <10 x float> %phi
}
-define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) {
+define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v10i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -367,7 +367,7 @@ end:
ret <10 x i32> %phi
}
-define inreg <10 x i32> @bitcast_v10f32_to_v10i32_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v10f32_to_v10i32_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v10i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -521,7 +521,7 @@ end:
ret <10 x i32> %phi
}
-define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) {
+define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v20i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -694,7 +694,7 @@ end:
ret <20 x i16> %phi
}
-define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v20i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -901,7 +901,7 @@ end:
ret <20 x i16> %phi
}
-define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) {
+define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v10i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1141,7 +1141,7 @@ end:
ret <10 x i32> %phi
}
-define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v10i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1414,7 +1414,7 @@ end:
ret <10 x i32> %phi
}
-define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
+define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1587,7 +1587,7 @@ end:
ret <20 x half> %phi
}
-define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v20f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1794,7 +1794,7 @@ end:
ret <20 x half> %phi
}
-define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
+define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v10i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2074,7 +2074,7 @@ end:
ret <10 x i32> %phi
}
-define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v10i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2381,7 +2381,7 @@ end:
ret <10 x i32> %phi
}
-define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
+define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v40i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3251,7 +3251,7 @@ end:
ret <40 x i8> %phi
}
-define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v40i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4049,7 +4049,7 @@ end:
ret <40 x i8> %phi
}
-define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
+define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v10i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5247,7 +5247,7 @@ end:
ret <10 x i32> %phi
}
-define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v10i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5972,7 +5972,7 @@ end:
ret <10 x i32> %phi
}
-define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
+define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6082,7 +6082,7 @@ end:
ret <5 x double> %phi
}
-define inreg <5 x double> @bitcast_v10i32_to_v5f64_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v10i32_to_v5f64_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v5f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6229,7 +6229,7 @@ end:
ret <5 x double> %phi
}
-define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) {
+define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v10i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6319,7 +6319,7 @@ end:
ret <10 x i32> %phi
}
-define inreg <10 x i32> @bitcast_v5f64_to_v10i32_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v5f64_to_v10i32_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v10i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6453,7 +6453,7 @@ end:
ret <10 x i32> %phi
}
-define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
+define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6563,7 +6563,7 @@ end:
ret <5 x i64> %phi
}
-define inreg <5 x i64> @bitcast_v10i32_to_v5i64_scalar(<10 x i32> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v10i32_to_v5i64_scalar(<10 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i32_to_v5i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6710,7 +6710,7 @@ end:
ret <5 x i64> %phi
}
-define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
+define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v10i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6823,7 +6823,7 @@ end:
ret <10 x i32> %phi
}
-define inreg <10 x i32> @bitcast_v5i64_to_v10i32_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x i32> @bitcast_v5i64_to_v10i32_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v10i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6970,7 +6970,7 @@ end:
ret <10 x i32> %phi
}
-define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) {
+define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v20i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7137,7 +7137,7 @@ end:
ret <20 x i16> %phi
}
-define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v20i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7377,7 +7377,7 @@ end:
ret <20 x i16> %phi
}
-define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) {
+define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v10f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7617,7 +7617,7 @@ end:
ret <10 x float> %phi
}
-define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v10f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7890,7 +7890,7 @@ end:
ret <10 x float> %phi
}
-define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) {
+define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8057,7 +8057,7 @@ end:
ret <20 x half> %phi
}
-define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v20f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8297,7 +8297,7 @@ end:
ret <20 x half> %phi
}
-define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) {
+define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v10f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8577,7 +8577,7 @@ end:
ret <10 x float> %phi
}
-define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v10f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8884,7 +8884,7 @@ end:
ret <10 x float> %phi
}
-define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
+define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v40i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9746,7 +9746,7 @@ end:
ret <40 x i8> %phi
}
-define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v40i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10637,7 +10637,7 @@ end:
ret <40 x i8> %phi
}
-define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
+define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v10f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11835,7 +11835,7 @@ end:
ret <10 x float> %phi
}
-define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v10f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12560,7 +12560,7 @@ end:
ret <10 x float> %phi
}
-define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) {
+define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12664,7 +12664,7 @@ end:
ret <5 x double> %phi
}
-define inreg <5 x double> @bitcast_v10f32_to_v5f64_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v10f32_to_v5f64_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v5f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12839,7 +12839,7 @@ end:
ret <5 x double> %phi
}
-define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) {
+define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v10f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12929,7 +12929,7 @@ end:
ret <10 x float> %phi
}
-define inreg <10 x float> @bitcast_v5f64_to_v10f32_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v5f64_to_v10f32_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v10f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13063,7 +13063,7 @@ end:
ret <10 x float> %phi
}
-define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) {
+define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13167,7 +13167,7 @@ end:
ret <5 x i64> %phi
}
-define inreg <5 x i64> @bitcast_v10f32_to_v5i64_scalar(<10 x float> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v10f32_to_v5i64_scalar(<10 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f32_to_v5i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13342,7 +13342,7 @@ end:
ret <5 x i64> %phi
}
-define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) {
+define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v10f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13455,7 +13455,7 @@ end:
ret <10 x float> %phi
}
-define inreg <10 x float> @bitcast_v5i64_to_v10f32_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x float> @bitcast_v5i64_to_v10f32_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v10f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13602,7 +13602,7 @@ end:
ret <10 x float> %phi
}
-define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) {
+define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13887,7 +13887,7 @@ end:
ret <20 x half> %phi
}
-define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v20f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14228,7 +14228,7 @@ end:
ret <20 x half> %phi
}
-define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) {
+define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v20i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14475,7 +14475,7 @@ end:
ret <20 x i16> %phi
}
-define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v20i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14812,7 +14812,7 @@ end:
ret <20 x i16> %phi
}
-define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
+define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v40i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15847,7 +15847,7 @@ end:
ret <40 x i8> %phi
}
-define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v40i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15855,10 +15855,10 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v3, s30, 0
-; SI-NEXT: v_writelane_b32 v3, s31, 1
-; SI-NEXT: v_writelane_b32 v3, s34, 2
-; SI-NEXT: v_writelane_b32 v3, s35, 3
+; SI-NEXT: v_writelane_b32 v3, s34, 0
+; SI-NEXT: v_writelane_b32 v3, s35, 1
+; SI-NEXT: v_writelane_b32 v3, s30, 2
+; SI-NEXT: v_writelane_b32 v3, s31, 3
; SI-NEXT: s_lshr_b32 s90, s25, 16
; SI-NEXT: s_lshr_b32 s35, s24, 16
; SI-NEXT: s_lshr_b32 s91, s23, 16
@@ -16136,11 +16136,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v3, 2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s35, v3, 3
-; SI-NEXT: v_readlane_b32 s34, v3, 2
-; SI-NEXT: v_readlane_b32 s31, v3, 1
-; SI-NEXT: v_readlane_b32 s30, v3, 0
+; SI-NEXT: v_readlane_b32 s31, v3, 3
+; SI-NEXT: v_readlane_b32 s35, v3, 1
+; SI-NEXT: v_readlane_b32 s34, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -16818,7 +16818,7 @@ end:
ret <40 x i8> %phi
}
-define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
+define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v20i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18154,7 +18154,7 @@ end:
ret <20 x i16> %phi
}
-define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v20i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18162,14 +18162,15 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v27, s30, 0
-; SI-NEXT: v_writelane_b32 v27, s31, 1
-; SI-NEXT: v_writelane_b32 v27, s34, 2
-; SI-NEXT: v_writelane_b32 v27, s35, 3
-; SI-NEXT: v_writelane_b32 v27, s36, 4
-; SI-NEXT: v_writelane_b32 v27, s37, 5
+; SI-NEXT: v_writelane_b32 v27, s34, 0
+; SI-NEXT: v_writelane_b32 v27, s35, 1
+; SI-NEXT: v_writelane_b32 v27, s36, 2
+; SI-NEXT: v_writelane_b32 v27, s37, 3
+; SI-NEXT: v_writelane_b32 v27, s38, 4
+; SI-NEXT: v_writelane_b32 v27, s39, 5
+; SI-NEXT: v_writelane_b32 v27, s30, 6
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
-; SI-NEXT: v_writelane_b32 v27, s38, 6
+; SI-NEXT: v_writelane_b32 v27, s31, 7
; SI-NEXT: v_readfirstlane_b32 s91, v25
; SI-NEXT: v_readfirstlane_b32 s90, v24
; SI-NEXT: v_readfirstlane_b32 s94, v23
@@ -18197,7 +18198,6 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_readfirstlane_b32 s58, v1
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s59, v0
-; SI-NEXT: v_writelane_b32 v27, s39, 7
; SI-NEXT: s_cbranch_scc0 .LBB51_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xff
@@ -18478,6 +18478,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_lshl_b32 s14, s15, 16
; SI-NEXT: s_or_b32 s5, s5, s14
+; SI-NEXT: v_readlane_b32 s30, v27, 6
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s10
@@ -18488,14 +18489,13 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
-; SI-NEXT: v_readlane_b32 s39, v27, 7
-; SI-NEXT: v_readlane_b32 s38, v27, 6
-; SI-NEXT: v_readlane_b32 s37, v27, 5
-; SI-NEXT: v_readlane_b32 s36, v27, 4
-; SI-NEXT: v_readlane_b32 s35, v27, 3
-; SI-NEXT: v_readlane_b32 s34, v27, 2
-; SI-NEXT: v_readlane_b32 s31, v27, 1
-; SI-NEXT: v_readlane_b32 s30, v27, 0
+; SI-NEXT: v_readlane_b32 s31, v27, 7
+; SI-NEXT: v_readlane_b32 s39, v27, 5
+; SI-NEXT: v_readlane_b32 s38, v27, 4
+; SI-NEXT: v_readlane_b32 s37, v27, 3
+; SI-NEXT: v_readlane_b32 s36, v27, 2
+; SI-NEXT: v_readlane_b32 s35, v27, 1
+; SI-NEXT: v_readlane_b32 s34, v27, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -19164,7 +19164,7 @@ end:
ret <20 x i16> %phi
}
-define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) {
+define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19404,7 +19404,7 @@ end:
ret <5 x double> %phi
}
-define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v5f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19710,7 +19710,7 @@ end:
ret <5 x double> %phi
}
-define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) {
+define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v20i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19863,7 +19863,7 @@ end:
ret <20 x i16> %phi
}
-define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v20i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20083,7 +20083,7 @@ end:
ret <20 x i16> %phi
}
-define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) {
+define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20323,7 +20323,7 @@ end:
ret <5 x i64> %phi
}
-define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i16_to_v5i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20629,7 +20629,7 @@ end:
ret <5 x i64> %phi
}
-define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
+define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v20i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20805,7 +20805,7 @@ end:
ret <20 x i16> %phi
}
-define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v20i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21012,7 +21012,7 @@ end:
ret <20 x i16> %phi
}
-define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
+define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v40i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22031,7 +22031,7 @@ end:
ret <40 x i8> %phi
}
-define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v40i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22039,10 +22039,10 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v12, s30, 0
-; SI-NEXT: v_writelane_b32 v12, s31, 1
-; SI-NEXT: v_writelane_b32 v12, s34, 2
-; SI-NEXT: v_writelane_b32 v12, s35, 3
+; SI-NEXT: v_writelane_b32 v12, s34, 0
+; SI-NEXT: v_writelane_b32 v12, s35, 1
+; SI-NEXT: v_writelane_b32 v12, s30, 2
+; SI-NEXT: v_writelane_b32 v12, s31, 3
; SI-NEXT: s_lshr_b32 s34, s25, 16
; SI-NEXT: s_lshr_b32 s35, s24, 16
; SI-NEXT: s_lshr_b32 s30, s23, 16
@@ -22400,11 +22400,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; SI-NEXT: v_readlane_b32 s30, v12, 2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s35, v12, 3
-; SI-NEXT: v_readlane_b32 s34, v12, 2
-; SI-NEXT: v_readlane_b32 s31, v12, 1
-; SI-NEXT: v_readlane_b32 s30, v12, 0
+; SI-NEXT: v_readlane_b32 s31, v12, 3
+; SI-NEXT: v_readlane_b32 s35, v12, 1
+; SI-NEXT: v_readlane_b32 s34, v12, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -23064,7 +23064,7 @@ end:
ret <40 x i8> %phi
}
-define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
+define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24400,7 +24400,7 @@ end:
ret <20 x half> %phi
}
-define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v20f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24408,14 +24408,15 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v27, s30, 0
-; SI-NEXT: v_writelane_b32 v27, s31, 1
-; SI-NEXT: v_writelane_b32 v27, s34, 2
-; SI-NEXT: v_writelane_b32 v27, s35, 3
-; SI-NEXT: v_writelane_b32 v27, s36, 4
-; SI-NEXT: v_writelane_b32 v27, s37, 5
+; SI-NEXT: v_writelane_b32 v27, s34, 0
+; SI-NEXT: v_writelane_b32 v27, s35, 1
+; SI-NEXT: v_writelane_b32 v27, s36, 2
+; SI-NEXT: v_writelane_b32 v27, s37, 3
+; SI-NEXT: v_writelane_b32 v27, s38, 4
+; SI-NEXT: v_writelane_b32 v27, s39, 5
+; SI-NEXT: v_writelane_b32 v27, s30, 6
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
-; SI-NEXT: v_writelane_b32 v27, s38, 6
+; SI-NEXT: v_writelane_b32 v27, s31, 7
; SI-NEXT: v_readfirstlane_b32 s91, v25
; SI-NEXT: v_readfirstlane_b32 s90, v24
; SI-NEXT: v_readfirstlane_b32 s94, v23
@@ -24443,7 +24444,6 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_readfirstlane_b32 s58, v1
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s59, v0
-; SI-NEXT: v_writelane_b32 v27, s39, 7
; SI-NEXT: s_cbranch_scc0 .LBB63_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xff
@@ -24724,6 +24724,7 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_lshl_b32 s14, s15, 16
; SI-NEXT: s_or_b32 s5, s5, s14
+; SI-NEXT: v_readlane_b32 s30, v27, 6
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s10
@@ -24734,14 +24735,13 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
-; SI-NEXT: v_readlane_b32 s39, v27, 7
-; SI-NEXT: v_readlane_b32 s38, v27, 6
-; SI-NEXT: v_readlane_b32 s37, v27, 5
-; SI-NEXT: v_readlane_b32 s36, v27, 4
-; SI-NEXT: v_readlane_b32 s35, v27, 3
-; SI-NEXT: v_readlane_b32 s34, v27, 2
-; SI-NEXT: v_readlane_b32 s31, v27, 1
-; SI-NEXT: v_readlane_b32 s30, v27, 0
+; SI-NEXT: v_readlane_b32 s31, v27, 7
+; SI-NEXT: v_readlane_b32 s39, v27, 5
+; SI-NEXT: v_readlane_b32 s38, v27, 4
+; SI-NEXT: v_readlane_b32 s37, v27, 3
+; SI-NEXT: v_readlane_b32 s36, v27, 2
+; SI-NEXT: v_readlane_b32 s35, v27, 1
+; SI-NEXT: v_readlane_b32 s34, v27, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -25410,7 +25410,7 @@ end:
ret <20 x half> %phi
}
-define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
+define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25690,7 +25690,7 @@ end:
ret <5 x double> %phi
}
-define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v5f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26043,7 +26043,7 @@ end:
ret <5 x double> %phi
}
-define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
+define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26196,7 +26196,7 @@ end:
ret <20 x half> %phi
}
-define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v20f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26416,7 +26416,7 @@ end:
ret <20 x half> %phi
}
-define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
+define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26696,7 +26696,7 @@ end:
ret <5 x i64> %phi
}
-define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f16_to_v5i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27049,7 +27049,7 @@ end:
ret <5 x i64> %phi
}
-define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
+define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27225,7 +27225,7 @@ end:
ret <20 x half> %phi
}
-define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v20f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27432,7 +27432,7 @@ end:
ret <20 x half> %phi
}
-define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
+define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28684,7 +28684,7 @@ end:
ret <5 x double> %phi
}
-define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v5f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29426,7 +29426,7 @@ end:
ret <5 x double> %phi
}
-define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
+define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v40i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30271,7 +30271,7 @@ end:
ret <40 x i8> %phi
}
-define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v40i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31137,7 +31137,7 @@ end:
ret <40 x i8> %phi
}
-define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
+define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32389,7 +32389,7 @@ end:
ret <5 x i64> %phi
}
-define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i8_to_v5i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33131,7 +33131,7 @@ end:
ret <5 x i64> %phi
}
-define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
+define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v40i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34007,7 +34007,7 @@ end:
ret <40 x i8> %phi
}
-define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v40i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34805,7 +34805,7 @@ end:
ret <40 x i8> %phi
}
-define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) {
+define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34895,7 +34895,7 @@ end:
ret <5 x i64> %phi
}
-define inreg <5 x i64> @bitcast_v5f64_to_v5i64_scalar(<5 x double> inreg %a, i32 inreg %b) {
+define inreg <5 x i64> @bitcast_v5f64_to_v5i64_scalar(<5 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5f64_to_v5i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35050,7 +35050,7 @@ end:
ret <5 x i64> %phi
}
-define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
+define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35163,7 +35163,7 @@ end:
ret <5 x double> %phi
}
-define inreg <5 x double> @bitcast_v5i64_to_v5f64_scalar(<5 x i64> inreg %a, i32 inreg %b) {
+define inreg <5 x double> @bitcast_v5i64_to_v5f64_scalar(<5 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v5i64_to_v5f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35308,3 +35308,5 @@ end:
%phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <5 x double> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 0a8af1ab3e547..123d1042e27c9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define float @bitcast_i32_to_f32(i32 %a, i32 %b) {
+define float @bitcast_i32_to_f32(i32 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i32_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -76,7 +76,7 @@ end:
ret float %phi
}
-define inreg float @bitcast_i32_to_f32_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg float @bitcast_i32_to_f32_scalar(i32 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i32_to_f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -157,7 +157,7 @@ end:
ret float %phi
}
-define i32 @bitcast_f32_to_i32(float %a, i32 %b) {
+define i32 @bitcast_f32_to_i32(float %a, i32 %b) #0 {
; SI-LABEL: bitcast_f32_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,7 +227,7 @@ end:
ret i32 %phi
}
-define inreg i32 @bitcast_f32_to_i32_scalar(float inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_f32_to_i32_scalar(float inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f32_to_i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -310,7 +310,7 @@ end:
ret i32 %phi
}
-define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) {
+define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i32_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -388,7 +388,7 @@ end:
ret <2 x i16> %phi
}
-define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i32_to_v2i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -475,7 +475,7 @@ end:
ret <2 x i16> %phi
}
-define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) {
+define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i16_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -567,7 +567,7 @@ end:
ret i32 %phi
}
-define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i16_to_i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -661,7 +661,7 @@ end:
ret i32 %phi
}
-define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) {
+define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i32_to_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -739,7 +739,7 @@ end:
ret <2 x half> %phi
}
-define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i32_to_v2f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,7 +826,7 @@ end:
ret <2 x half> %phi
}
-define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) {
+define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f16_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -918,7 +918,7 @@ end:
ret i32 %phi
}
-define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f16_to_i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1019,7 +1019,7 @@ end:
ret i32 %phi
}
-define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) {
+define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i32_to_v2bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1102,7 +1102,7 @@ end:
ret <2 x bfloat> %phi
}
-define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i32_to_v2bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1192,7 +1192,7 @@ end:
ret <2 x bfloat> %phi
}
-define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
+define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1376,7 +1376,7 @@ end:
ret i32 %phi
}
-define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1571,7 +1571,7 @@ end:
ret i32 %phi
}
-define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) {
+define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i32_to_v1i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1641,7 +1641,7 @@ end:
ret <1 x i32> %phi
}
-define inreg <1 x i32> @bitcast_i32_to_v1i32_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_i32_to_v1i32_scalar(i32 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i32_to_v1i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1722,7 +1722,7 @@ end:
ret <1 x i32> %phi
}
-define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) {
+define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v1i32_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1792,7 +1792,7 @@ end:
ret i32 %phi
}
-define inreg i32 @bitcast_v1i32_to_i32_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v1i32_to_i32_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v1i32_to_i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1873,7 +1873,7 @@ end:
ret i32 %phi
}
-define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) {
+define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i32_to_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2039,7 +2039,7 @@ end:
ret <4 x i8> %phi
}
-define inreg <4 x i8> @bitcast_i32_to_v4i8_scalar(i32 inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_i32_to_v4i8_scalar(i32 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i32_to_v4i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2166,7 +2166,7 @@ end:
ret <4 x i8> %phi
}
-define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
+define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i8_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2401,7 +2401,7 @@ end:
ret i32 %phi
}
-define inreg i32 @bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg i32 @bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i8_to_i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2555,7 +2555,7 @@ end:
ret i32 %phi
}
-define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) {
+define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) #0 {
; SI-LABEL: bitcast_f32_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2633,7 +2633,7 @@ end:
ret <2 x i16> %phi
}
-define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f32_to_v2i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2724,7 +2724,7 @@ end:
ret <2 x i16> %phi
}
-define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) {
+define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i16_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2816,7 +2816,7 @@ end:
ret float %phi
}
-define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i16_to_f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2910,7 +2910,7 @@ end:
ret float %phi
}
-define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) {
+define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) #0 {
; SI-LABEL: bitcast_f32_to_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2988,7 +2988,7 @@ end:
ret <2 x half> %phi
}
-define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f32_to_v2f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3079,7 +3079,7 @@ end:
ret <2 x half> %phi
}
-define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) {
+define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f16_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3171,7 +3171,7 @@ end:
ret float %phi
}
-define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f16_to_f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3272,7 +3272,7 @@ end:
ret float %phi
}
-define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) {
+define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) #0 {
; SI-LABEL: bitcast_f32_to_v2bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3355,7 +3355,7 @@ end:
ret <2 x bfloat> %phi
}
-define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f32_to_v2bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3450,7 +3450,7 @@ end:
ret <2 x bfloat> %phi
}
-define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
+define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3634,7 +3634,7 @@ end:
ret float %phi
}
-define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3829,7 +3829,7 @@ end:
ret float %phi
}
-define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) {
+define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) #0 {
; SI-LABEL: bitcast_f32_to_v1i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3899,7 +3899,7 @@ end:
ret <1 x i32> %phi
}
-define inreg <1 x i32> @bitcast_f32_to_v1i32_scalar(float inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_f32_to_v1i32_scalar(float inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f32_to_v1i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3982,7 +3982,7 @@ end:
ret <1 x i32> %phi
}
-define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) {
+define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v1i32_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4052,7 +4052,7 @@ end:
ret float %phi
}
-define inreg float @bitcast_v1i32_to_f32_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v1i32_to_f32_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v1i32_to_f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4133,7 +4133,7 @@ end:
ret float %phi
}
-define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) {
+define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) #0 {
; SI-LABEL: bitcast_f32_to_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4299,7 +4299,7 @@ end:
ret <4 x i8> %phi
}
-define inreg <4 x i8> @bitcast_f32_to_v4i8_scalar(float inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_f32_to_v4i8_scalar(float inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f32_to_v4i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4429,7 +4429,7 @@ end:
ret <4 x i8> %phi
}
-define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
+define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i8_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4664,7 +4664,7 @@ end:
ret float %phi
}
-define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i8_to_f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4818,7 +4818,7 @@ end:
ret float %phi
}
-define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) {
+define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i16_to_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4907,7 +4907,7 @@ end:
ret <2 x half> %phi
}
-define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i16_to_v2f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5005,7 +5005,7 @@ end:
ret <2 x half> %phi
}
-define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) {
+define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f16_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5091,7 +5091,7 @@ end:
ret <2 x i16> %phi
}
-define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f16_to_v2i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5193,7 +5193,7 @@ end:
ret <2 x i16> %phi
}
-define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i16_to_v2bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5274,7 +5274,7 @@ end:
ret <2 x bfloat> %phi
}
-define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i16_to_v2bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5370,7 +5370,7 @@ end:
ret <2 x bfloat> %phi
}
-define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) {
+define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5545,7 +5545,7 @@ end:
ret <2 x i16> %phi
}
-define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_v2i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5737,7 +5737,7 @@ end:
ret <2 x i16> %phi
}
-define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) {
+define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i16_to_v1i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5829,7 +5829,7 @@ end:
ret <1 x i32> %phi
}
-define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i16_to_v1i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5923,7 +5923,7 @@ end:
ret <1 x i32> %phi
}
-define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) {
+define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v1i32_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6001,7 +6001,7 @@ end:
ret <2 x i16> %phi
}
-define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v1i32_to_v2i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6088,7 +6088,7 @@ end:
ret <2 x i16> %phi
}
-define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) {
+define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i16_to_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6269,7 +6269,7 @@ end:
ret <4 x i8> %phi
}
-define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i16_to_v4i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6410,7 +6410,7 @@ end:
ret <4 x i8> %phi
}
-define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
+define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i8_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6645,7 +6645,7 @@ end:
ret <2 x i16> %phi
}
-define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i8_to_v2i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6805,7 +6805,7 @@ end:
ret <2 x i16> %phi
}
-define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f16_to_v2bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6900,7 +6900,7 @@ end:
ret <2 x bfloat> %phi
}
-define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f16_to_v2bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7007,7 +7007,7 @@ end:
ret <2 x bfloat> %phi
}
-define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
+define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7189,7 +7189,7 @@ end:
ret <2 x half> %phi
}
-define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_v2f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7387,7 +7387,7 @@ end:
ret <2 x half> %phi
}
-define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) {
+define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f16_to_v1i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7479,7 +7479,7 @@ end:
ret <1 x i32> %phi
}
-define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f16_to_v1i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7580,7 +7580,7 @@ end:
ret <1 x i32> %phi
}
-define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) {
+define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v1i32_to_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7658,7 +7658,7 @@ end:
ret <2 x half> %phi
}
-define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v1i32_to_v2f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7745,7 +7745,7 @@ end:
ret <2 x half> %phi
}
-define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) {
+define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f16_to_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7925,7 +7925,7 @@ end:
ret <4 x i8> %phi
}
-define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f16_to_v4i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8069,7 +8069,7 @@ end:
ret <4 x i8> %phi
}
-define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
+define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i8_to_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8304,7 +8304,7 @@ end:
ret <2 x half> %phi
}
-define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i8_to_v2f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8464,7 +8464,7 @@ end:
ret <2 x half> %phi
}
-define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
+define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_v1i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8648,7 +8648,7 @@ end:
ret <1 x i32> %phi
}
-define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_v1i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8843,7 +8843,7 @@ end:
ret <1 x i32> %phi
}
-define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v1i32_to_v2bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8926,7 +8926,7 @@ end:
ret <2 x bfloat> %phi
}
-define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v1i32_to_v2bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9016,7 +9016,7 @@ end:
ret <2 x bfloat> %phi
}
-define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
+define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9267,7 +9267,7 @@ end:
ret <4 x i8> %phi
}
-define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2bf16_to_v4i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9515,7 +9515,7 @@ end:
ret <4 x i8> %phi
}
-define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
+define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i8_to_v2bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9747,7 +9747,7 @@ end:
ret <2 x bfloat> %phi
}
-define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i8_to_v2bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9903,7 +9903,7 @@ end:
ret <2 x bfloat> %phi
}
-define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) {
+define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v1i32_to_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10069,7 +10069,7 @@ end:
ret <4 x i8> %phi
}
-define inreg <4 x i8> @bitcast_v1i32_to_v4i8_scalar(<1 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i8> @bitcast_v1i32_to_v4i8_scalar(<1 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v1i32_to_v4i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10196,7 +10196,7 @@ end:
ret <4 x i8> %phi
}
-define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
+define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i8_to_v1i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10431,7 +10431,7 @@ end:
ret <1 x i32> %phi
}
-define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inreg %b) {
+define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i8_to_v1i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10584,3 +10584,5 @@ end:
%phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <1 x i32> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
index 70ed2ca42b706..79c9fc7faf339 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) {
+define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11i32_to_v11f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,7 +120,7 @@ end:
ret <11 x float> %phi
}
-define inreg <11 x float> @bitcast_v11i32_to_v11f32_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v11i32_to_v11f32_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11i32_to_v11f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -275,7 +275,7 @@ end:
ret <11 x float> %phi
}
-define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) {
+define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11f32_to_v11i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -383,7 +383,7 @@ end:
ret <11 x i32> %phi
}
-define inreg <11 x i32> @bitcast_v11f32_to_v11i32_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v11f32_to_v11i32_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11f32_to_v11i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -545,7 +545,7 @@ end:
ret <11 x i32> %phi
}
-define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) {
+define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11i32_to_v22i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -728,7 +728,7 @@ end:
ret <22 x i16> %phi
}
-define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11i32_to_v22i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -949,7 +949,7 @@ end:
ret <22 x i16> %phi
}
-define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) {
+define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22i16_to_v11i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1205,7 +1205,7 @@ end:
ret <11 x i32> %phi
}
-define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22i16_to_v11i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1498,7 +1498,7 @@ end:
ret <11 x i32> %phi
}
-define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) {
+define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11i32_to_v22f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1681,7 +1681,7 @@ end:
ret <22 x half> %phi
}
-define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11i32_to_v22f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1902,7 +1902,7 @@ end:
ret <22 x half> %phi
}
-define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) {
+define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22f16_to_v11i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2202,7 +2202,7 @@ end:
ret <11 x i32> %phi
}
-define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22f16_to_v11i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2532,7 +2532,7 @@ end:
ret <11 x i32> %phi
}
-define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) {
+define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11f32_to_v22i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2709,7 +2709,7 @@ end:
ret <22 x i16> %phi
}
-define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11f32_to_v22i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2961,7 +2961,7 @@ end:
ret <22 x i16> %phi
}
-define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) {
+define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22i16_to_v11f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3217,7 +3217,7 @@ end:
ret <11 x float> %phi
}
-define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22i16_to_v11f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3510,7 +3510,7 @@ end:
ret <11 x float> %phi
}
-define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) {
+define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11f32_to_v22f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3687,7 +3687,7 @@ end:
ret <22 x half> %phi
}
-define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11f32_to_v22f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3939,7 +3939,7 @@ end:
ret <22 x half> %phi
}
-define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) {
+define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22f16_to_v11f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4239,7 +4239,7 @@ end:
ret <11 x float> %phi
}
-define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22f16_to_v11f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4569,7 +4569,7 @@ end:
ret <11 x float> %phi
}
-define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) {
+define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22i16_to_v22f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4874,7 +4874,7 @@ end:
ret <22 x half> %phi
}
-define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22i16_to_v22f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5238,7 +5238,7 @@ end:
ret <22 x half> %phi
}
-define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) {
+define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22f16_to_v22i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5502,7 +5502,7 @@ end:
ret <22 x i16> %phi
}
-define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22f16_to_v22i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5859,6 +5859,9 @@ end:
%phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <22 x i16> %phi
}
+
+attributes #0 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-FAKE16: {{.*}}
; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
index 60c5431f7e4c6..cb0e72323a165 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) {
+define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v12f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -124,7 +124,7 @@ end:
ret <12 x float> %phi
}
-define inreg <12 x float> @bitcast_v12i32_to_v12f32_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v12i32_to_v12f32_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v12f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -286,7 +286,7 @@ end:
ret <12 x float> %phi
}
-define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) {
+define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v12i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -397,7 +397,7 @@ end:
ret <12 x i32> %phi
}
-define inreg <12 x i32> @bitcast_v12f32_to_v12i32_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v12f32_to_v12i32_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v12i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -566,7 +566,7 @@ end:
ret <12 x i32> %phi
}
-define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) {
+define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v6f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -684,7 +684,7 @@ end:
ret <6 x double> %phi
}
-define inreg <6 x double> @bitcast_v12i32_to_v6f64_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v12i32_to_v6f64_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v6f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -846,7 +846,7 @@ end:
ret <6 x double> %phi
}
-define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) {
+define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v12i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -940,7 +940,7 @@ end:
ret <12 x i32> %phi
}
-define inreg <12 x i32> @bitcast_v6f64_to_v12i32_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v6f64_to_v12i32_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v12i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1085,7 +1085,7 @@ end:
ret <12 x i32> %phi
}
-define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) {
+define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v6i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1203,7 +1203,7 @@ end:
ret <6 x i64> %phi
}
-define inreg <6 x i64> @bitcast_v12i32_to_v6i64_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v12i32_to_v6i64_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v6i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1365,7 +1365,7 @@ end:
ret <6 x i64> %phi
}
-define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) {
+define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v12i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1486,7 +1486,7 @@ end:
ret <12 x i32> %phi
}
-define inreg <12 x i32> @bitcast_v6i64_to_v12i32_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v6i64_to_v12i32_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v12i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1648,7 +1648,7 @@ end:
ret <12 x i32> %phi
}
-define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) {
+define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v24i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1841,7 +1841,7 @@ end:
ret <24 x i16> %phi
}
-define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v24i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2075,7 +2075,7 @@ end:
ret <24 x i16> %phi
}
-define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) {
+define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v12i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2347,7 +2347,7 @@ end:
ret <12 x i32> %phi
}
-define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v12i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2659,7 +2659,7 @@ end:
ret <12 x i32> %phi
}
-define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) {
+define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v24f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2852,7 +2852,7 @@ end:
ret <24 x half> %phi
}
-define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i32_to_v24f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3086,7 +3086,7 @@ end:
ret <24 x half> %phi
}
-define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) {
+define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v12i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3406,7 +3406,7 @@ end:
ret <12 x i32> %phi
}
-define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v12i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3758,7 +3758,7 @@ end:
ret <12 x i32> %phi
}
-define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) {
+define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v6f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3869,7 +3869,7 @@ end:
ret <6 x double> %phi
}
-define inreg <6 x double> @bitcast_v12f32_to_v6f64_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v12f32_to_v6f64_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v6f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4052,7 +4052,7 @@ end:
ret <6 x double> %phi
}
-define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) {
+define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v12f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4146,7 +4146,7 @@ end:
ret <12 x float> %phi
}
-define inreg <12 x float> @bitcast_v6f64_to_v12f32_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v6f64_to_v12f32_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v12f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4291,7 +4291,7 @@ end:
ret <12 x float> %phi
}
-define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) {
+define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v6i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4402,7 +4402,7 @@ end:
ret <6 x i64> %phi
}
-define inreg <6 x i64> @bitcast_v12f32_to_v6i64_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v12f32_to_v6i64_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v6i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4585,7 +4585,7 @@ end:
ret <6 x i64> %phi
}
-define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) {
+define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v12f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4706,7 +4706,7 @@ end:
ret <12 x float> %phi
}
-define inreg <12 x float> @bitcast_v6i64_to_v12f32_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v6i64_to_v12f32_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v12f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4868,7 +4868,7 @@ end:
ret <12 x float> %phi
}
-define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) {
+define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v24i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5054,7 +5054,7 @@ end:
ret <24 x i16> %phi
}
-define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v24i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5318,7 +5318,7 @@ end:
ret <24 x i16> %phi
}
-define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) {
+define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v12f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5590,7 +5590,7 @@ end:
ret <12 x float> %phi
}
-define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v12f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5902,7 +5902,7 @@ end:
ret <12 x float> %phi
}
-define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) {
+define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v24f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6088,7 +6088,7 @@ end:
ret <24 x half> %phi
}
-define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f32_to_v24f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6352,7 +6352,7 @@ end:
ret <24 x half> %phi
}
-define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) {
+define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v12f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6672,7 +6672,7 @@ end:
ret <12 x float> %phi
}
-define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v12f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7024,7 +7024,7 @@ end:
ret <12 x float> %phi
}
-define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) {
+define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v6i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7118,7 +7118,7 @@ end:
ret <6 x i64> %phi
}
-define inreg <6 x i64> @bitcast_v6f64_to_v6i64_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v6f64_to_v6i64_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v6i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7277,7 +7277,7 @@ end:
ret <6 x i64> %phi
}
-define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) {
+define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v6f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7398,7 +7398,7 @@ end:
ret <6 x double> %phi
}
-define inreg <6 x double> @bitcast_v6i64_to_v6f64_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v6i64_to_v6f64_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v6f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7559,7 +7559,7 @@ end:
ret <6 x double> %phi
}
-define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) {
+define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v24i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7728,7 +7728,7 @@ end:
ret <24 x i16> %phi
}
-define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v24i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7968,7 +7968,7 @@ end:
ret <24 x i16> %phi
}
-define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) {
+define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v6f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8240,7 +8240,7 @@ end:
ret <6 x double> %phi
}
-define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v6f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8582,7 +8582,7 @@ end:
ret <6 x double> %phi
}
-define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) {
+define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v24f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8751,7 +8751,7 @@ end:
ret <24 x half> %phi
}
-define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f64_to_v24f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8991,7 +8991,7 @@ end:
ret <24 x half> %phi
}
-define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) {
+define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v6f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9311,7 +9311,7 @@ end:
ret <6 x double> %phi
}
-define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v6f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9702,7 +9702,7 @@ end:
ret <6 x double> %phi
}
-define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) {
+define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v24i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9898,7 +9898,7 @@ end:
ret <24 x i16> %phi
}
-define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v24i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10132,7 +10132,7 @@ end:
ret <24 x i16> %phi
}
-define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) {
+define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v6i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10404,7 +10404,7 @@ end:
ret <6 x i64> %phi
}
-define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v6i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10746,7 +10746,7 @@ end:
ret <6 x i64> %phi
}
-define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) {
+define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v24f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10942,7 +10942,7 @@ end:
ret <24 x half> %phi
}
-define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i64_to_v24f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11176,7 +11176,7 @@ end:
ret <24 x half> %phi
}
-define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) {
+define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v6i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11496,7 +11496,7 @@ end:
ret <6 x i64> %phi
}
-define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v6i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11887,7 +11887,7 @@ end:
ret <6 x i64> %phi
}
-define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) {
+define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v24f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12214,7 +12214,7 @@ end:
ret <24 x half> %phi
}
-define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i16_to_v24f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12603,7 +12603,7 @@ end:
ret <24 x half> %phi
}
-define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) {
+define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v24i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12885,7 +12885,7 @@ end:
ret <24 x i16> %phi
}
-define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f16_to_v24i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13264,6 +13264,9 @@ end:
%phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <24 x i16> %phi
}
+
+attributes #0 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-FAKE16: {{.*}}
; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
index 8e5490d7eeafc..2f1dc7a1c6992 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <14 x float> @bitcast_v14i32_to_v14f32(<14 x i32> %a, i32 %b) {
+define <14 x float> @bitcast_v14i32_to_v14f32(<14 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v14f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -132,7 +132,7 @@ end:
ret <14 x float> %phi
}
-define inreg <14 x float> @bitcast_v14i32_to_v14f32_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v14i32_to_v14f32_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v14f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -312,7 +312,7 @@ end:
ret <14 x float> %phi
}
-define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) {
+define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v14i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -431,7 +431,7 @@ end:
ret <14 x i32> %phi
}
-define inreg <14 x i32> @bitcast_v14f32_to_v14i32_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v14f32_to_v14i32_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v14i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -625,7 +625,7 @@ end:
ret <14 x i32> %phi
}
-define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) {
+define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v7i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -751,7 +751,7 @@ end:
ret <7 x i64> %phi
}
-define inreg <7 x i64> @bitcast_v14i32_to_v7i64_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v14i32_to_v7i64_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v7i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -931,7 +931,7 @@ end:
ret <7 x i64> %phi
}
-define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) {
+define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v14i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1061,7 +1061,7 @@ end:
ret <14 x i32> %phi
}
-define inreg <14 x i32> @bitcast_v7i64_to_v14i32_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v7i64_to_v14i32_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v14i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1241,7 +1241,7 @@ end:
ret <14 x i32> %phi
}
-define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) {
+define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v7f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1367,7 +1367,7 @@ end:
ret <7 x double> %phi
}
-define inreg <7 x double> @bitcast_v14i32_to_v7f64_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v14i32_to_v7f64_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v7f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1547,7 +1547,7 @@ end:
ret <7 x double> %phi
}
-define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) {
+define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v14i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1645,7 +1645,7 @@ end:
ret <14 x i32> %phi
}
-define inreg <14 x i32> @bitcast_v7f64_to_v14i32_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v7f64_to_v14i32_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v14i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1811,7 +1811,7 @@ end:
ret <14 x i32> %phi
}
-define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) {
+define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v28i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2024,7 +2024,7 @@ end:
ret <28 x i16> %phi
}
-define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v28i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2288,7 +2288,7 @@ end:
ret <28 x i16> %phi
}
-define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) {
+define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v14i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2592,7 +2592,7 @@ end:
ret <14 x i32> %phi
}
-define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v14i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2973,7 +2973,7 @@ end:
ret <14 x i32> %phi
}
-define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) {
+define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v28f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3186,7 +3186,7 @@ end:
ret <28 x half> %phi
}
-define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i32_to_v28f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3450,7 +3450,7 @@ end:
ret <28 x half> %phi
}
-define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) {
+define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v14i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3810,7 +3810,7 @@ end:
ret <14 x i32> %phi
}
-define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v14i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4242,7 +4242,7 @@ end:
ret <14 x i32> %phi
}
-define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) {
+define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v7i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4361,7 +4361,7 @@ end:
ret <7 x i64> %phi
}
-define inreg <7 x i64> @bitcast_v14f32_to_v7i64_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v14f32_to_v7i64_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v7i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4555,7 +4555,7 @@ end:
ret <7 x i64> %phi
}
-define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) {
+define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v14f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4685,7 +4685,7 @@ end:
ret <14 x float> %phi
}
-define inreg <14 x float> @bitcast_v7i64_to_v14f32_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v7i64_to_v14f32_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v14f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4865,7 +4865,7 @@ end:
ret <14 x float> %phi
}
-define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) {
+define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v7f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4984,7 +4984,7 @@ end:
ret <7 x double> %phi
}
-define inreg <7 x double> @bitcast_v14f32_to_v7f64_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v14f32_to_v7f64_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v7f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5178,7 +5178,7 @@ end:
ret <7 x double> %phi
}
-define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) {
+define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v14f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5276,7 +5276,7 @@ end:
ret <14 x float> %phi
}
-define inreg <14 x float> @bitcast_v7f64_to_v14f32_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v7f64_to_v14f32_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v14f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5442,7 +5442,7 @@ end:
ret <14 x float> %phi
}
-define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) {
+define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v28i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5648,7 +5648,7 @@ end:
ret <28 x i16> %phi
}
-define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v28i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5939,7 +5939,7 @@ end:
ret <28 x i16> %phi
}
-define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) {
+define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v14f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6243,7 +6243,7 @@ end:
ret <14 x float> %phi
}
-define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v14f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6624,7 +6624,7 @@ end:
ret <14 x float> %phi
}
-define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) {
+define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v28f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6830,7 +6830,7 @@ end:
ret <28 x half> %phi
}
-define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f32_to_v28f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7121,7 +7121,7 @@ end:
ret <28 x half> %phi
}
-define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) {
+define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v14f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7481,7 +7481,7 @@ end:
ret <14 x float> %phi
}
-define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v14f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7913,7 +7913,7 @@ end:
ret <14 x float> %phi
}
-define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) {
+define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v7f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8043,7 +8043,7 @@ end:
ret <7 x double> %phi
}
-define inreg <7 x double> @bitcast_v7i64_to_v7f64_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v7i64_to_v7f64_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v7f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8222,7 +8222,7 @@ end:
ret <7 x double> %phi
}
-define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) {
+define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v7i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8320,7 +8320,7 @@ end:
ret <7 x i64> %phi
}
-define inreg <7 x i64> @bitcast_v7f64_to_v7i64_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v7f64_to_v7i64_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v7i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8486,7 +8486,7 @@ end:
ret <7 x i64> %phi
}
-define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) {
+define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v28i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8703,7 +8703,7 @@ end:
ret <28 x i16> %phi
}
-define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v28i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8967,7 +8967,7 @@ end:
ret <28 x i16> %phi
}
-define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) {
+define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v7i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9271,7 +9271,7 @@ end:
ret <7 x i64> %phi
}
-define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v7i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9652,7 +9652,7 @@ end:
ret <7 x i64> %phi
}
-define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) {
+define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v28f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9869,7 +9869,7 @@ end:
ret <28 x half> %phi
}
-define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7i64_to_v28f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10133,7 +10133,7 @@ end:
ret <28 x half> %phi
}
-define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) {
+define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v7i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10493,7 +10493,7 @@ end:
ret <7 x i64> %phi
}
-define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v7i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10925,7 +10925,7 @@ end:
ret <7 x i64> %phi
}
-define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) {
+define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v28i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11110,7 +11110,7 @@ end:
ret <28 x i16> %phi
}
-define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v28i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11373,7 +11373,7 @@ end:
ret <28 x i16> %phi
}
-define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) {
+define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v7f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11677,7 +11677,7 @@ end:
ret <7 x double> %phi
}
-define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v7f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12058,7 +12058,7 @@ end:
ret <7 x double> %phi
}
-define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) {
+define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v28f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12243,7 +12243,7 @@ end:
ret <28 x half> %phi
}
-define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v7f64_to_v28f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12506,7 +12506,7 @@ end:
ret <28 x half> %phi
}
-define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) {
+define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v7f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12866,7 +12866,7 @@ end:
ret <7 x double> %phi
}
-define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v7f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13298,7 +13298,7 @@ end:
ret <7 x double> %phi
}
-define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) {
+define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v28f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13688,7 +13688,7 @@ end:
ret <28 x half> %phi
}
-define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i16_to_v28f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14128,7 +14128,7 @@ end:
ret <28 x half> %phi
}
-define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) {
+define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v28i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14445,7 +14445,7 @@ end:
ret <28 x i16> %phi
}
-define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f16_to_v28i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14870,6 +14870,8 @@ end:
%phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <28 x i16> %phi
}
+attributes #0 = { nounwind }
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-FAKE16: {{.*}}
; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index 547985e7ef4e3..121aad5b96cb3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
+define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3bf16_to_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -242,7 +242,7 @@ end:
ret <3 x half> %phi
}
-define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3bf16_to_v3f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -500,7 +500,7 @@ end:
ret <3 x half> %phi
}
-define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) {
+define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f16_to_v3bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -608,7 +608,7 @@ end:
ret <3 x bfloat> %phi
}
-define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f16_to_v3bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -730,7 +730,7 @@ end:
ret <3 x bfloat> %phi
}
-define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
+define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3bf16_to_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -960,7 +960,7 @@ end:
ret <3 x i16> %phi
}
-define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3bf16_to_v3i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1209,7 +1209,7 @@ end:
ret <3 x i16> %phi
}
-define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) {
+define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i16_to_v3bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1305,7 +1305,7 @@ end:
ret <3 x bfloat> %phi
}
-define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i16_to_v3bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1415,7 +1415,7 @@ end:
ret <3 x bfloat> %phi
}
-define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) {
+define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f16_to_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1509,7 +1509,7 @@ end:
ret <3 x i16> %phi
}
-define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f16_to_v3i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1622,7 +1622,7 @@ end:
ret <3 x i16> %phi
}
-define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) {
+define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i16_to_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1722,7 +1722,7 @@ end:
ret <3 x half> %phi
}
-define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i16_to_v3f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1832,3 +1832,5 @@ end:
%phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <3 x half> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 5a06737d923f1..6872449b4334c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs=0 < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) {
+define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -140,7 +140,7 @@ end:
ret <16 x float> %phi
}
-define inreg <16 x float> @bitcast_v16i32_to_v16f32_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v16i32_to_v16f32_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v16f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -338,7 +338,7 @@ end:
ret <16 x float> %phi
}
-define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) {
+define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,7 +464,7 @@ end:
ret <16 x i32> %phi
}
-define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v16i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -666,7 +666,7 @@ end:
ret <16 x i32> %phi
}
-define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) {
+define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -800,7 +800,7 @@ end:
ret <8 x i64> %phi
}
-define inreg <8 x i64> @bitcast_v16i32_to_v8i64_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v16i32_to_v8i64_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v8i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -998,7 +998,7 @@ end:
ret <8 x i64> %phi
}
-define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) {
+define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1136,7 +1136,7 @@ end:
ret <16 x i32> %phi
}
-define inreg <16 x i32> @bitcast_v8i64_to_v16i32_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v8i64_to_v16i32_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v16i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1334,7 +1334,7 @@ end:
ret <16 x i32> %phi
}
-define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) {
+define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1468,7 +1468,7 @@ end:
ret <8 x double> %phi
}
-define inreg <8 x double> @bitcast_v16i32_to_v8f64_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v16i32_to_v8f64_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v8f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1666,7 +1666,7 @@ end:
ret <8 x double> %phi
}
-define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) {
+define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1768,7 +1768,7 @@ end:
ret <16 x i32> %phi
}
-define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v16i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1938,7 +1938,7 @@ end:
ret <16 x i32> %phi
}
-define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) {
+define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2171,7 +2171,7 @@ end:
ret <32 x i16> %phi
}
-define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v32i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2494,7 +2494,7 @@ end:
ret <32 x i16> %phi
}
-define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) {
+define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2830,7 +2830,7 @@ end:
ret <16 x i32> %phi
}
-define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v16i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3265,7 +3265,7 @@ end:
ret <16 x i32> %phi
}
-define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) {
+define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3498,7 +3498,7 @@ end:
ret <32 x half> %phi
}
-define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3821,7 +3821,7 @@ end:
ret <32 x half> %phi
}
-define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) {
+define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4221,7 +4221,7 @@ end:
ret <16 x i32> %phi
}
-define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v16i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4633,7 +4633,7 @@ end:
ret <16 x i32> %phi
}
-define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4946,7 +4946,7 @@ end:
ret <32 x bfloat> %phi
}
-define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5317,7 +5317,7 @@ end:
ret <32 x bfloat> %phi
}
-define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
+define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6723,7 +6723,7 @@ end:
ret <16 x i32> %phi
}
-define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v16i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8337,7 +8337,7 @@ end:
ret <16 x i32> %phi
}
-define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
+define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9804,7 +9804,7 @@ end:
ret <64 x i8> %phi
}
-define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16i32_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9812,34 +9812,34 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v18, s30, 0
-; SI-NEXT: v_writelane_b32 v18, s31, 1
-; SI-NEXT: v_writelane_b32 v18, s34, 2
-; SI-NEXT: v_writelane_b32 v18, s35, 3
-; SI-NEXT: v_writelane_b32 v18, s36, 4
-; SI-NEXT: v_writelane_b32 v18, s37, 5
-; SI-NEXT: v_writelane_b32 v18, s38, 6
-; SI-NEXT: v_writelane_b32 v18, s39, 7
-; SI-NEXT: v_writelane_b32 v18, s48, 8
-; SI-NEXT: v_writelane_b32 v18, s49, 9
-; SI-NEXT: v_writelane_b32 v18, s50, 10
-; SI-NEXT: v_writelane_b32 v18, s51, 11
-; SI-NEXT: v_writelane_b32 v18, s52, 12
-; SI-NEXT: v_writelane_b32 v18, s53, 13
-; SI-NEXT: v_writelane_b32 v18, s54, 14
-; SI-NEXT: v_writelane_b32 v18, s55, 15
-; SI-NEXT: v_writelane_b32 v18, s64, 16
-; SI-NEXT: v_writelane_b32 v18, s65, 17
-; SI-NEXT: v_writelane_b32 v18, s66, 18
-; SI-NEXT: v_writelane_b32 v18, s67, 19
-; SI-NEXT: v_writelane_b32 v18, s68, 20
-; SI-NEXT: v_writelane_b32 v18, s69, 21
-; SI-NEXT: v_writelane_b32 v18, s70, 22
-; SI-NEXT: v_writelane_b32 v18, s71, 23
-; SI-NEXT: v_writelane_b32 v18, s80, 24
-; SI-NEXT: v_writelane_b32 v18, s81, 25
-; SI-NEXT: v_writelane_b32 v18, s82, 26
-; SI-NEXT: v_writelane_b32 v18, s83, 27
+; SI-NEXT: v_writelane_b32 v18, s34, 0
+; SI-NEXT: v_writelane_b32 v18, s35, 1
+; SI-NEXT: v_writelane_b32 v18, s36, 2
+; SI-NEXT: v_writelane_b32 v18, s37, 3
+; SI-NEXT: v_writelane_b32 v18, s38, 4
+; SI-NEXT: v_writelane_b32 v18, s39, 5
+; SI-NEXT: v_writelane_b32 v18, s48, 6
+; SI-NEXT: v_writelane_b32 v18, s49, 7
+; SI-NEXT: v_writelane_b32 v18, s50, 8
+; SI-NEXT: v_writelane_b32 v18, s51, 9
+; SI-NEXT: v_writelane_b32 v18, s52, 10
+; SI-NEXT: v_writelane_b32 v18, s53, 11
+; SI-NEXT: v_writelane_b32 v18, s54, 12
+; SI-NEXT: v_writelane_b32 v18, s55, 13
+; SI-NEXT: v_writelane_b32 v18, s64, 14
+; SI-NEXT: v_writelane_b32 v18, s65, 15
+; SI-NEXT: v_writelane_b32 v18, s66, 16
+; SI-NEXT: v_writelane_b32 v18, s67, 17
+; SI-NEXT: v_writelane_b32 v18, s68, 18
+; SI-NEXT: v_writelane_b32 v18, s69, 19
+; SI-NEXT: v_writelane_b32 v18, s70, 20
+; SI-NEXT: v_writelane_b32 v18, s71, 21
+; SI-NEXT: v_writelane_b32 v18, s80, 22
+; SI-NEXT: v_writelane_b32 v18, s81, 23
+; SI-NEXT: v_writelane_b32 v18, s82, 24
+; SI-NEXT: v_writelane_b32 v18, s83, 25
+; SI-NEXT: v_writelane_b32 v18, s84, 26
+; SI-NEXT: v_writelane_b32 v18, s85, 27
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
@@ -9855,7 +9855,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v16, s28
; SI-NEXT: v_mov_b32_e32 v17, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_writelane_b32 v18, s84, 28
+; SI-NEXT: v_writelane_b32 v18, s30, 28
; SI-NEXT: v_readfirstlane_b32 s20, v4
; SI-NEXT: v_readfirstlane_b32 s21, v5
; SI-NEXT: v_readfirstlane_b32 s16, v6
@@ -9873,7 +9873,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; SI-NEXT: v_readfirstlane_b32 s4, v1
; SI-NEXT: s_and_b64 s[18:19], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v2
-; SI-NEXT: v_writelane_b32 v18, s85, 29
+; SI-NEXT: v_writelane_b32 v18, s31, 29
; SI-NEXT: s_cbranch_scc0 .LBB25_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 24
@@ -10194,37 +10194,37 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v18, 28
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s85, v18, 29
-; SI-NEXT: v_readlane_b32 s84, v18, 28
-; SI-NEXT: v_readlane_b32 s83, v18, 27
-; SI-NEXT: v_readlane_b32 s82, v18, 26
-; SI-NEXT: v_readlane_b32 s81, v18, 25
-; SI-NEXT: v_readlane_b32 s80, v18, 24
-; SI-NEXT: v_readlane_b32 s71, v18, 23
-; SI-NEXT: v_readlane_b32 s70, v18, 22
-; SI-NEXT: v_readlane_b32 s69, v18, 21
-; SI-NEXT: v_readlane_b32 s68, v18, 20
-; SI-NEXT: v_readlane_b32 s67, v18, 19
-; SI-NEXT: v_readlane_b32 s66, v18, 18
-; SI-NEXT: v_readlane_b32 s65, v18, 17
-; SI-NEXT: v_readlane_b32 s64, v18, 16
-; SI-NEXT: v_readlane_b32 s55, v18, 15
-; SI-NEXT: v_readlane_b32 s54, v18, 14
-; SI-NEXT: v_readlane_b32 s53, v18, 13
-; SI-NEXT: v_readlane_b32 s52, v18, 12
-; SI-NEXT: v_readlane_b32 s51, v18, 11
-; SI-NEXT: v_readlane_b32 s50, v18, 10
-; SI-NEXT: v_readlane_b32 s49, v18, 9
-; SI-NEXT: v_readlane_b32 s48, v18, 8
-; SI-NEXT: v_readlane_b32 s39, v18, 7
-; SI-NEXT: v_readlane_b32 s38, v18, 6
-; SI-NEXT: v_readlane_b32 s37, v18, 5
-; SI-NEXT: v_readlane_b32 s36, v18, 4
-; SI-NEXT: v_readlane_b32 s35, v18, 3
-; SI-NEXT: v_readlane_b32 s34, v18, 2
-; SI-NEXT: v_readlane_b32 s31, v18, 1
-; SI-NEXT: v_readlane_b32 s30, v18, 0
+; SI-NEXT: v_readlane_b32 s31, v18, 29
+; SI-NEXT: v_readlane_b32 s85, v18, 27
+; SI-NEXT: v_readlane_b32 s84, v18, 26
+; SI-NEXT: v_readlane_b32 s83, v18, 25
+; SI-NEXT: v_readlane_b32 s82, v18, 24
+; SI-NEXT: v_readlane_b32 s81, v18, 23
+; SI-NEXT: v_readlane_b32 s80, v18, 22
+; SI-NEXT: v_readlane_b32 s71, v18, 21
+; SI-NEXT: v_readlane_b32 s70, v18, 20
+; SI-NEXT: v_readlane_b32 s69, v18, 19
+; SI-NEXT: v_readlane_b32 s68, v18, 18
+; SI-NEXT: v_readlane_b32 s67, v18, 17
+; SI-NEXT: v_readlane_b32 s66, v18, 16
+; SI-NEXT: v_readlane_b32 s65, v18, 15
+; SI-NEXT: v_readlane_b32 s64, v18, 14
+; SI-NEXT: v_readlane_b32 s55, v18, 13
+; SI-NEXT: v_readlane_b32 s54, v18, 12
+; SI-NEXT: v_readlane_b32 s53, v18, 11
+; SI-NEXT: v_readlane_b32 s52, v18, 10
+; SI-NEXT: v_readlane_b32 s51, v18, 9
+; SI-NEXT: v_readlane_b32 s50, v18, 8
+; SI-NEXT: v_readlane_b32 s49, v18, 7
+; SI-NEXT: v_readlane_b32 s48, v18, 6
+; SI-NEXT: v_readlane_b32 s39, v18, 5
+; SI-NEXT: v_readlane_b32 s38, v18, 4
+; SI-NEXT: v_readlane_b32 s37, v18, 3
+; SI-NEXT: v_readlane_b32 s36, v18, 2
+; SI-NEXT: v_readlane_b32 s35, v18, 1
+; SI-NEXT: v_readlane_b32 s34, v18, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -10287,24 +10287,24 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v18, s30, 0
-; VI-NEXT: v_writelane_b32 v18, s31, 1
-; VI-NEXT: v_writelane_b32 v18, s34, 2
-; VI-NEXT: v_writelane_b32 v18, s35, 3
-; VI-NEXT: v_writelane_b32 v18, s36, 4
-; VI-NEXT: v_writelane_b32 v18, s37, 5
-; VI-NEXT: v_writelane_b32 v18, s38, 6
-; VI-NEXT: v_writelane_b32 v18, s39, 7
-; VI-NEXT: v_writelane_b32 v18, s48, 8
-; VI-NEXT: v_writelane_b32 v18, s49, 9
-; VI-NEXT: v_writelane_b32 v18, s50, 10
-; VI-NEXT: v_writelane_b32 v18, s51, 11
-; VI-NEXT: v_writelane_b32 v18, s52, 12
-; VI-NEXT: v_writelane_b32 v18, s53, 13
-; VI-NEXT: v_writelane_b32 v18, s54, 14
-; VI-NEXT: v_writelane_b32 v18, s55, 15
-; VI-NEXT: v_writelane_b32 v18, s64, 16
-; VI-NEXT: v_writelane_b32 v18, s65, 17
+; VI-NEXT: v_writelane_b32 v18, s34, 0
+; VI-NEXT: v_writelane_b32 v18, s35, 1
+; VI-NEXT: v_writelane_b32 v18, s36, 2
+; VI-NEXT: v_writelane_b32 v18, s37, 3
+; VI-NEXT: v_writelane_b32 v18, s38, 4
+; VI-NEXT: v_writelane_b32 v18, s39, 5
+; VI-NEXT: v_writelane_b32 v18, s48, 6
+; VI-NEXT: v_writelane_b32 v18, s49, 7
+; VI-NEXT: v_writelane_b32 v18, s50, 8
+; VI-NEXT: v_writelane_b32 v18, s51, 9
+; VI-NEXT: v_writelane_b32 v18, s52, 10
+; VI-NEXT: v_writelane_b32 v18, s53, 11
+; VI-NEXT: v_writelane_b32 v18, s54, 12
+; VI-NEXT: v_writelane_b32 v18, s55, 13
+; VI-NEXT: v_writelane_b32 v18, s64, 14
+; VI-NEXT: v_writelane_b32 v18, s65, 15
+; VI-NEXT: v_writelane_b32 v18, s66, 16
+; VI-NEXT: v_writelane_b32 v18, s67, 17
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
@@ -10320,7 +10320,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v16, s28
; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v18, s66, 18
+; VI-NEXT: v_writelane_b32 v18, s30, 18
; VI-NEXT: v_readfirstlane_b32 s18, v4
; VI-NEXT: v_readfirstlane_b32 s19, v5
; VI-NEXT: v_readfirstlane_b32 s16, v6
@@ -10338,7 +10338,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; VI-NEXT: v_readfirstlane_b32 s4, v1
; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: v_writelane_b32 v18, s67, 19
+; VI-NEXT: v_writelane_b32 v18, s31, 19
; VI-NEXT: s_cbranch_scc0 .LBB25_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -10583,27 +10583,27 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: v_readlane_b32 s30, v18, 18
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s67, v18, 19
-; VI-NEXT: v_readlane_b32 s66, v18, 18
-; VI-NEXT: v_readlane_b32 s65, v18, 17
-; VI-NEXT: v_readlane_b32 s64, v18, 16
-; VI-NEXT: v_readlane_b32 s55, v18, 15
-; VI-NEXT: v_readlane_b32 s54, v18, 14
-; VI-NEXT: v_readlane_b32 s53, v18, 13
-; VI-NEXT: v_readlane_b32 s52, v18, 12
-; VI-NEXT: v_readlane_b32 s51, v18, 11
-; VI-NEXT: v_readlane_b32 s50, v18, 10
-; VI-NEXT: v_readlane_b32 s49, v18, 9
-; VI-NEXT: v_readlane_b32 s48, v18, 8
-; VI-NEXT: v_readlane_b32 s39, v18, 7
-; VI-NEXT: v_readlane_b32 s38, v18, 6
-; VI-NEXT: v_readlane_b32 s37, v18, 5
-; VI-NEXT: v_readlane_b32 s36, v18, 4
-; VI-NEXT: v_readlane_b32 s35, v18, 3
-; VI-NEXT: v_readlane_b32 s34, v18, 2
-; VI-NEXT: v_readlane_b32 s31, v18, 1
-; VI-NEXT: v_readlane_b32 s30, v18, 0
+; VI-NEXT: v_readlane_b32 s31, v18, 19
+; VI-NEXT: v_readlane_b32 s67, v18, 17
+; VI-NEXT: v_readlane_b32 s66, v18, 16
+; VI-NEXT: v_readlane_b32 s65, v18, 15
+; VI-NEXT: v_readlane_b32 s64, v18, 14
+; VI-NEXT: v_readlane_b32 s55, v18, 13
+; VI-NEXT: v_readlane_b32 s54, v18, 12
+; VI-NEXT: v_readlane_b32 s53, v18, 11
+; VI-NEXT: v_readlane_b32 s52, v18, 10
+; VI-NEXT: v_readlane_b32 s51, v18, 9
+; VI-NEXT: v_readlane_b32 s50, v18, 8
+; VI-NEXT: v_readlane_b32 s49, v18, 7
+; VI-NEXT: v_readlane_b32 s48, v18, 6
+; VI-NEXT: v_readlane_b32 s39, v18, 5
+; VI-NEXT: v_readlane_b32 s38, v18, 4
+; VI-NEXT: v_readlane_b32 s37, v18, 3
+; VI-NEXT: v_readlane_b32 s36, v18, 2
+; VI-NEXT: v_readlane_b32 s35, v18, 1
+; VI-NEXT: v_readlane_b32 s34, v18, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -10666,20 +10666,20 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v18, s30, 0
-; GFX9-NEXT: v_writelane_b32 v18, s31, 1
-; GFX9-NEXT: v_writelane_b32 v18, s34, 2
-; GFX9-NEXT: v_writelane_b32 v18, s35, 3
-; GFX9-NEXT: v_writelane_b32 v18, s36, 4
-; GFX9-NEXT: v_writelane_b32 v18, s37, 5
-; GFX9-NEXT: v_writelane_b32 v18, s38, 6
-; GFX9-NEXT: v_writelane_b32 v18, s39, 7
-; GFX9-NEXT: v_writelane_b32 v18, s48, 8
-; GFX9-NEXT: v_writelane_b32 v18, s49, 9
-; GFX9-NEXT: v_writelane_b32 v18, s50, 10
-; GFX9-NEXT: v_writelane_b32 v18, s51, 11
-; GFX9-NEXT: v_writelane_b32 v18, s52, 12
-; GFX9-NEXT: v_writelane_b32 v18, s53, 13
+; GFX9-NEXT: v_writelane_b32 v18, s34, 0
+; GFX9-NEXT: v_writelane_b32 v18, s35, 1
+; GFX9-NEXT: v_writelane_b32 v18, s36, 2
+; GFX9-NEXT: v_writelane_b32 v18, s37, 3
+; GFX9-NEXT: v_writelane_b32 v18, s38, 4
+; GFX9-NEXT: v_writelane_b32 v18, s39, 5
+; GFX9-NEXT: v_writelane_b32 v18, s48, 6
+; GFX9-NEXT: v_writelane_b32 v18, s49, 7
+; GFX9-NEXT: v_writelane_b32 v18, s50, 8
+; GFX9-NEXT: v_writelane_b32 v18, s51, 9
+; GFX9-NEXT: v_writelane_b32 v18, s52, 10
+; GFX9-NEXT: v_writelane_b32 v18, s53, 11
+; GFX9-NEXT: v_writelane_b32 v18, s54, 12
+; GFX9-NEXT: v_writelane_b32 v18, s55, 13
; GFX9-NEXT: v_mov_b32_e32 v4, s16
; GFX9-NEXT: v_mov_b32_e32 v5, s17
; GFX9-NEXT: v_mov_b32_e32 v6, s18
@@ -10695,7 +10695,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9-NEXT: v_mov_b32_e32 v16, s28
; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_writelane_b32 v18, s54, 14
+; GFX9-NEXT: v_writelane_b32 v18, s30, 14
; GFX9-NEXT: v_readfirstlane_b32 s18, v4
; GFX9-NEXT: v_readfirstlane_b32 s19, v5
; GFX9-NEXT: v_readfirstlane_b32 s16, v6
@@ -10713,7 +10713,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: v_writelane_b32 v18, s55, 15
+; GFX9-NEXT: v_writelane_b32 v18, s31, 15
; GFX9-NEXT: s_cbranch_scc0 .LBB25_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -10943,23 +10943,23 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX9-NEXT: v_perm_b32 v2, s57, v3, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT: v_readlane_b32 s30, v18, 14
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: v_readlane_b32 s55, v18, 15
-; GFX9-NEXT: v_readlane_b32 s54, v18, 14
-; GFX9-NEXT: v_readlane_b32 s53, v18, 13
-; GFX9-NEXT: v_readlane_b32 s52, v18, 12
-; GFX9-NEXT: v_readlane_b32 s51, v18, 11
-; GFX9-NEXT: v_readlane_b32 s50, v18, 10
-; GFX9-NEXT: v_readlane_b32 s49, v18, 9
-; GFX9-NEXT: v_readlane_b32 s48, v18, 8
-; GFX9-NEXT: v_readlane_b32 s39, v18, 7
-; GFX9-NEXT: v_readlane_b32 s38, v18, 6
-; GFX9-NEXT: v_readlane_b32 s37, v18, 5
-; GFX9-NEXT: v_readlane_b32 s36, v18, 4
-; GFX9-NEXT: v_readlane_b32 s35, v18, 3
-; GFX9-NEXT: v_readlane_b32 s34, v18, 2
-; GFX9-NEXT: v_readlane_b32 s31, v18, 1
-; GFX9-NEXT: v_readlane_b32 s30, v18, 0
+; GFX9-NEXT: v_readlane_b32 s31, v18, 15
+; GFX9-NEXT: v_readlane_b32 s55, v18, 13
+; GFX9-NEXT: v_readlane_b32 s54, v18, 12
+; GFX9-NEXT: v_readlane_b32 s53, v18, 11
+; GFX9-NEXT: v_readlane_b32 s52, v18, 10
+; GFX9-NEXT: v_readlane_b32 s51, v18, 9
+; GFX9-NEXT: v_readlane_b32 s50, v18, 8
+; GFX9-NEXT: v_readlane_b32 s49, v18, 7
+; GFX9-NEXT: v_readlane_b32 s48, v18, 6
+; GFX9-NEXT: v_readlane_b32 s39, v18, 5
+; GFX9-NEXT: v_readlane_b32 s38, v18, 4
+; GFX9-NEXT: v_readlane_b32 s37, v18, 3
+; GFX9-NEXT: v_readlane_b32 s36, v18, 2
+; GFX9-NEXT: v_readlane_b32 s35, v18, 1
+; GFX9-NEXT: v_readlane_b32 s34, v18, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -11022,17 +11022,17 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX11-NEXT: s_xor_saveexec_b32 s4, -1
; GFX11-NEXT: scratch_store_b32 off, v23, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v23, s30, 0
+; GFX11-NEXT: v_writelane_b32 v23, s34, 0
; GFX11-NEXT: s_cmp_lg_u32 s28, 0
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
-; GFX11-NEXT: v_writelane_b32 v23, s31, 1
-; GFX11-NEXT: v_writelane_b32 v23, s34, 2
-; GFX11-NEXT: v_writelane_b32 v23, s35, 3
-; GFX11-NEXT: v_writelane_b32 v23, s36, 4
-; GFX11-NEXT: v_writelane_b32 v23, s37, 5
-; GFX11-NEXT: v_writelane_b32 v23, s38, 6
-; GFX11-NEXT: v_writelane_b32 v23, s39, 7
-; GFX11-NEXT: v_writelane_b32 v23, s48, 8
+; GFX11-NEXT: v_writelane_b32 v23, s35, 1
+; GFX11-NEXT: v_writelane_b32 v23, s36, 2
+; GFX11-NEXT: v_writelane_b32 v23, s37, 3
+; GFX11-NEXT: v_writelane_b32 v23, s38, 4
+; GFX11-NEXT: v_writelane_b32 v23, s39, 5
+; GFX11-NEXT: v_writelane_b32 v23, s48, 6
+; GFX11-NEXT: v_writelane_b32 v23, s30, 7
+; GFX11-NEXT: v_writelane_b32 v23, s31, 8
; GFX11-NEXT: s_cbranch_scc0 .LBB25_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 24
@@ -11154,7 +11154,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX11-NEXT: v_mov_b32_e32 v12, 0xc0c0004
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_perm_b32 v5, s34, s28, v12
-; GFX11-NEXT: v_readlane_b32 s34, v23, 2
+; GFX11-NEXT: v_readlane_b32 s34, v23, 0
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_perm_b32 v2, s39, s40, v12
; GFX11-NEXT: v_perm_b32 v4, s37, s36, v12
@@ -11218,19 +11218,19 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
; GFX11-NEXT: v_or_b32_e32 v14, v19, v17
; GFX11-NEXT: v_or_b32_e32 v15, v21, v18
; GFX11-NEXT: v_or_b32_e32 v16, v22, v20
+; GFX11-NEXT: v_readlane_b32 s30, v23, 7
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT: v_readlane_b32 s48, v23, 8
-; GFX11-NEXT: v_readlane_b32 s39, v23, 7
-; GFX11-NEXT: v_readlane_b32 s38, v23, 6
-; GFX11-NEXT: v_readlane_b32 s37, v23, 5
-; GFX11-NEXT: v_readlane_b32 s36, v23, 4
-; GFX11-NEXT: v_readlane_b32 s35, v23, 3
-; GFX11-NEXT: v_readlane_b32 s31, v23, 1
-; GFX11-NEXT: v_readlane_b32 s30, v23, 0
+; GFX11-NEXT: v_readlane_b32 s31, v23, 8
+; GFX11-NEXT: v_readlane_b32 s48, v23, 6
+; GFX11-NEXT: v_readlane_b32 s39, v23, 5
+; GFX11-NEXT: v_readlane_b32 s38, v23, 4
+; GFX11-NEXT: v_readlane_b32 s37, v23, 3
+; GFX11-NEXT: v_readlane_b32 s36, v23, 2
+; GFX11-NEXT: v_readlane_b32 s35, v23, 1
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v23, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -11303,7 +11303,7 @@ end:
ret <64 x i8> %phi
}
-define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
+define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13579,7 +13579,7 @@ end:
ret <16 x i32> %phi
}
-define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v16i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15255,7 +15255,7 @@ end:
ret <16 x i32> %phi
}
-define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) {
+define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15381,7 +15381,7 @@ end:
ret <8 x i64> %phi
}
-define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v8i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15583,7 +15583,7 @@ end:
ret <8 x i64> %phi
}
-define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) {
+define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15721,7 +15721,7 @@ end:
ret <16 x float> %phi
}
-define inreg <16 x float> @bitcast_v8i64_to_v16f32_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v8i64_to_v16f32_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v16f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15919,7 +15919,7 @@ end:
ret <16 x float> %phi
}
-define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) {
+define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16045,7 +16045,7 @@ end:
ret <8 x double> %phi
}
-define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v8f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16247,7 +16247,7 @@ end:
ret <8 x double> %phi
}
-define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) {
+define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16349,7 +16349,7 @@ end:
ret <16 x float> %phi
}
-define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v16f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16519,7 +16519,7 @@ end:
ret <16 x float> %phi
}
-define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) {
+define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16744,7 +16744,7 @@ end:
ret <32 x i16> %phi
}
-define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v32i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17041,7 +17041,7 @@ end:
ret <32 x i16> %phi
}
-define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) {
+define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17377,7 +17377,7 @@ end:
ret <16 x float> %phi
}
-define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v16f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17812,7 +17812,7 @@ end:
ret <16 x float> %phi
}
-define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) {
+define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18037,7 +18037,7 @@ end:
ret <32 x half> %phi
}
-define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18334,7 +18334,7 @@ end:
ret <32 x half> %phi
}
-define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) {
+define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18734,7 +18734,7 @@ end:
ret <16 x float> %phi
}
-define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v16f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19146,7 +19146,7 @@ end:
ret <16 x float> %phi
}
-define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19451,7 +19451,7 @@ end:
ret <32 x bfloat> %phi
}
-define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19860,7 +19860,7 @@ end:
ret <32 x bfloat> %phi
}
-define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
+define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21266,7 +21266,7 @@ end:
ret <16 x float> %phi
}
-define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v16f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22880,7 +22880,7 @@ end:
ret <16 x float> %phi
}
-define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
+define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24331,7 +24331,7 @@ end:
ret <64 x i8> %phi
}
-define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v16f32_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24339,34 +24339,35 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v40, s30, 0
-; SI-NEXT: v_writelane_b32 v40, s31, 1
-; SI-NEXT: v_writelane_b32 v40, s34, 2
-; SI-NEXT: v_writelane_b32 v40, s35, 3
-; SI-NEXT: v_writelane_b32 v40, s36, 4
-; SI-NEXT: v_writelane_b32 v40, s37, 5
-; SI-NEXT: v_writelane_b32 v40, s38, 6
-; SI-NEXT: v_writelane_b32 v40, s39, 7
-; SI-NEXT: v_writelane_b32 v40, s48, 8
-; SI-NEXT: v_writelane_b32 v40, s49, 9
-; SI-NEXT: v_writelane_b32 v40, s50, 10
-; SI-NEXT: v_writelane_b32 v40, s51, 11
-; SI-NEXT: v_writelane_b32 v40, s52, 12
-; SI-NEXT: v_writelane_b32 v40, s53, 13
-; SI-NEXT: v_writelane_b32 v40, s54, 14
-; SI-NEXT: v_writelane_b32 v40, s55, 15
-; SI-NEXT: v_writelane_b32 v40, s64, 16
-; SI-NEXT: v_writelane_b32 v40, s65, 17
-; SI-NEXT: v_writelane_b32 v40, s66, 18
-; SI-NEXT: v_writelane_b32 v40, s67, 19
-; SI-NEXT: v_writelane_b32 v40, s68, 20
-; SI-NEXT: v_writelane_b32 v40, s69, 21
-; SI-NEXT: v_writelane_b32 v40, s70, 22
-; SI-NEXT: v_writelane_b32 v40, s71, 23
-; SI-NEXT: v_writelane_b32 v40, s80, 24
-; SI-NEXT: v_writelane_b32 v40, s81, 25
-; SI-NEXT: v_writelane_b32 v40, s82, 26
-; SI-NEXT: v_writelane_b32 v40, s83, 27
+; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s30, 28
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
@@ -24382,7 +24383,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v16, s28
; SI-NEXT: v_mov_b32_e32 v17, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_writelane_b32 v40, s84, 28
+; SI-NEXT: v_writelane_b32 v40, s31, 29
; SI-NEXT: v_readfirstlane_b32 s36, v4
; SI-NEXT: v_readfirstlane_b32 s37, v5
; SI-NEXT: v_readfirstlane_b32 s34, v6
@@ -24400,7 +24401,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s78, v1
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s79, v2
-; SI-NEXT: v_writelane_b32 v40, s85, 29
; SI-NEXT: s_cbranch_scc0 .LBB49_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s82, s79, 24
@@ -24815,37 +24815,37 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT: v_readlane_b32 s30, v40, 28
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s85, v40, 29
-; SI-NEXT: v_readlane_b32 s84, v40, 28
-; SI-NEXT: v_readlane_b32 s83, v40, 27
-; SI-NEXT: v_readlane_b32 s82, v40, 26
-; SI-NEXT: v_readlane_b32 s81, v40, 25
-; SI-NEXT: v_readlane_b32 s80, v40, 24
-; SI-NEXT: v_readlane_b32 s71, v40, 23
-; SI-NEXT: v_readlane_b32 s70, v40, 22
-; SI-NEXT: v_readlane_b32 s69, v40, 21
-; SI-NEXT: v_readlane_b32 s68, v40, 20
-; SI-NEXT: v_readlane_b32 s67, v40, 19
-; SI-NEXT: v_readlane_b32 s66, v40, 18
-; SI-NEXT: v_readlane_b32 s65, v40, 17
-; SI-NEXT: v_readlane_b32 s64, v40, 16
-; SI-NEXT: v_readlane_b32 s55, v40, 15
-; SI-NEXT: v_readlane_b32 s54, v40, 14
-; SI-NEXT: v_readlane_b32 s53, v40, 13
-; SI-NEXT: v_readlane_b32 s52, v40, 12
-; SI-NEXT: v_readlane_b32 s51, v40, 11
-; SI-NEXT: v_readlane_b32 s50, v40, 10
-; SI-NEXT: v_readlane_b32 s49, v40, 9
-; SI-NEXT: v_readlane_b32 s48, v40, 8
-; SI-NEXT: v_readlane_b32 s39, v40, 7
-; SI-NEXT: v_readlane_b32 s38, v40, 6
-; SI-NEXT: v_readlane_b32 s37, v40, 5
-; SI-NEXT: v_readlane_b32 s36, v40, 4
-; SI-NEXT: v_readlane_b32 s35, v40, 3
-; SI-NEXT: v_readlane_b32 s34, v40, 2
-; SI-NEXT: v_readlane_b32 s31, v40, 1
-; SI-NEXT: v_readlane_b32 s30, v40, 0
+; SI-NEXT: v_readlane_b32 s31, v40, 29
+; SI-NEXT: v_readlane_b32 s85, v40, 27
+; SI-NEXT: v_readlane_b32 s84, v40, 26
+; SI-NEXT: v_readlane_b32 s83, v40, 25
+; SI-NEXT: v_readlane_b32 s82, v40, 24
+; SI-NEXT: v_readlane_b32 s81, v40, 23
+; SI-NEXT: v_readlane_b32 s80, v40, 22
+; SI-NEXT: v_readlane_b32 s71, v40, 21
+; SI-NEXT: v_readlane_b32 s70, v40, 20
+; SI-NEXT: v_readlane_b32 s69, v40, 19
+; SI-NEXT: v_readlane_b32 s68, v40, 18
+; SI-NEXT: v_readlane_b32 s67, v40, 17
+; SI-NEXT: v_readlane_b32 s66, v40, 16
+; SI-NEXT: v_readlane_b32 s65, v40, 15
+; SI-NEXT: v_readlane_b32 s64, v40, 14
+; SI-NEXT: v_readlane_b32 s55, v40, 13
+; SI-NEXT: v_readlane_b32 s54, v40, 12
+; SI-NEXT: v_readlane_b32 s53, v40, 11
+; SI-NEXT: v_readlane_b32 s52, v40, 10
+; SI-NEXT: v_readlane_b32 s51, v40, 9
+; SI-NEXT: v_readlane_b32 s50, v40, 8
+; SI-NEXT: v_readlane_b32 s49, v40, 7
+; SI-NEXT: v_readlane_b32 s48, v40, 6
+; SI-NEXT: v_readlane_b32 s39, v40, 5
+; SI-NEXT: v_readlane_b32 s38, v40, 4
+; SI-NEXT: v_readlane_b32 s37, v40, 3
+; SI-NEXT: v_readlane_b32 s36, v40, 2
+; SI-NEXT: v_readlane_b32 s35, v40, 1
+; SI-NEXT: v_readlane_b32 s34, v40, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -25487,18 +25487,18 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-NEXT: s_cmp_lg_u32 s28, 0
; GFX11-NEXT: s_mov_b32 s42, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-NEXT: v_writelane_b32 v40, s35, 1
+; GFX11-NEXT: v_writelane_b32 v40, s36, 2
+; GFX11-NEXT: v_writelane_b32 v40, s37, 3
+; GFX11-NEXT: v_writelane_b32 v40, s38, 4
+; GFX11-NEXT: v_writelane_b32 v40, s39, 5
+; GFX11-NEXT: v_writelane_b32 v40, s48, 6
+; GFX11-NEXT: v_writelane_b32 v40, s49, 7
+; GFX11-NEXT: v_writelane_b32 v40, s30, 8
+; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_cbranch_scc0 .LBB49_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s43, s27, 24
@@ -25766,21 +25766,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
; GFX11-NEXT: v_or_b32_e32 v2, v4, v17
; GFX11-NEXT: v_or_b32_e32 v3, v19, v15
; GFX11-NEXT: v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:32
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
+; GFX11-NEXT: v_readlane_b32 s49, v40, 7
+; GFX11-NEXT: v_readlane_b32 s48, v40, 6
+; GFX11-NEXT: v_readlane_b32 s39, v40, 5
+; GFX11-NEXT: v_readlane_b32 s38, v40, 4
+; GFX11-NEXT: v_readlane_b32 s37, v40, 3
+; GFX11-NEXT: v_readlane_b32 s36, v40, 2
+; GFX11-NEXT: v_readlane_b32 s35, v40, 1
+; GFX11-NEXT: v_readlane_b32 s34, v40, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -25803,7 +25803,7 @@ end:
ret <64 x i8> %phi
}
-define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
+define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28079,7 +28079,7 @@ end:
ret <16 x float> %phi
}
-define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v16f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29755,7 +29755,7 @@ end:
ret <16 x float> %phi
}
-define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) {
+define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29893,7 +29893,7 @@ end:
ret <8 x double> %phi
}
-define inreg <8 x double> @bitcast_v8i64_to_v8f64_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v8i64_to_v8f64_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v8f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30090,7 +30090,7 @@ end:
ret <8 x double> %phi
}
-define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) {
+define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30192,7 +30192,7 @@ end:
ret <8 x i64> %phi
}
-define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v8i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30362,7 +30362,7 @@ end:
ret <8 x i64> %phi
}
-define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
+define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30599,7 +30599,7 @@ end:
ret <32 x i16> %phi
}
-define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v32i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30922,7 +30922,7 @@ end:
ret <32 x i16> %phi
}
-define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
+define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31258,7 +31258,7 @@ end:
ret <8 x i64> %phi
}
-define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v8i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31693,7 +31693,7 @@ end:
ret <8 x i64> %phi
}
-define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
+define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31930,7 +31930,7 @@ end:
ret <32 x half> %phi
}
-define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32253,7 +32253,7 @@ end:
ret <32 x half> %phi
}
-define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) {
+define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32653,7 +32653,7 @@ end:
ret <8 x i64> %phi
}
-define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v8i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33065,7 +33065,7 @@ end:
ret <8 x i64> %phi
}
-define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33382,7 +33382,7 @@ end:
ret <32 x bfloat> %phi
}
-define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33753,7 +33753,7 @@ end:
ret <32 x bfloat> %phi
}
-define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
+define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35159,7 +35159,7 @@ end:
ret <8 x i64> %phi
}
-define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v8i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36773,7 +36773,7 @@ end:
ret <8 x i64> %phi
}
-define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
+define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38250,7 +38250,7 @@ end:
ret <64 x i8> %phi
}
-define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i64_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38258,34 +38258,34 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v18, s30, 0
-; SI-NEXT: v_writelane_b32 v18, s31, 1
-; SI-NEXT: v_writelane_b32 v18, s34, 2
-; SI-NEXT: v_writelane_b32 v18, s35, 3
-; SI-NEXT: v_writelane_b32 v18, s36, 4
-; SI-NEXT: v_writelane_b32 v18, s37, 5
-; SI-NEXT: v_writelane_b32 v18, s38, 6
-; SI-NEXT: v_writelane_b32 v18, s39, 7
-; SI-NEXT: v_writelane_b32 v18, s48, 8
-; SI-NEXT: v_writelane_b32 v18, s49, 9
-; SI-NEXT: v_writelane_b32 v18, s50, 10
-; SI-NEXT: v_writelane_b32 v18, s51, 11
-; SI-NEXT: v_writelane_b32 v18, s52, 12
-; SI-NEXT: v_writelane_b32 v18, s53, 13
-; SI-NEXT: v_writelane_b32 v18, s54, 14
-; SI-NEXT: v_writelane_b32 v18, s55, 15
-; SI-NEXT: v_writelane_b32 v18, s64, 16
-; SI-NEXT: v_writelane_b32 v18, s65, 17
-; SI-NEXT: v_writelane_b32 v18, s66, 18
-; SI-NEXT: v_writelane_b32 v18, s67, 19
-; SI-NEXT: v_writelane_b32 v18, s68, 20
-; SI-NEXT: v_writelane_b32 v18, s69, 21
-; SI-NEXT: v_writelane_b32 v18, s70, 22
-; SI-NEXT: v_writelane_b32 v18, s71, 23
-; SI-NEXT: v_writelane_b32 v18, s80, 24
-; SI-NEXT: v_writelane_b32 v18, s81, 25
-; SI-NEXT: v_writelane_b32 v18, s82, 26
-; SI-NEXT: v_writelane_b32 v18, s83, 27
+; SI-NEXT: v_writelane_b32 v18, s34, 0
+; SI-NEXT: v_writelane_b32 v18, s35, 1
+; SI-NEXT: v_writelane_b32 v18, s36, 2
+; SI-NEXT: v_writelane_b32 v18, s37, 3
+; SI-NEXT: v_writelane_b32 v18, s38, 4
+; SI-NEXT: v_writelane_b32 v18, s39, 5
+; SI-NEXT: v_writelane_b32 v18, s48, 6
+; SI-NEXT: v_writelane_b32 v18, s49, 7
+; SI-NEXT: v_writelane_b32 v18, s50, 8
+; SI-NEXT: v_writelane_b32 v18, s51, 9
+; SI-NEXT: v_writelane_b32 v18, s52, 10
+; SI-NEXT: v_writelane_b32 v18, s53, 11
+; SI-NEXT: v_writelane_b32 v18, s54, 12
+; SI-NEXT: v_writelane_b32 v18, s55, 13
+; SI-NEXT: v_writelane_b32 v18, s64, 14
+; SI-NEXT: v_writelane_b32 v18, s65, 15
+; SI-NEXT: v_writelane_b32 v18, s66, 16
+; SI-NEXT: v_writelane_b32 v18, s67, 17
+; SI-NEXT: v_writelane_b32 v18, s68, 18
+; SI-NEXT: v_writelane_b32 v18, s69, 19
+; SI-NEXT: v_writelane_b32 v18, s70, 20
+; SI-NEXT: v_writelane_b32 v18, s71, 21
+; SI-NEXT: v_writelane_b32 v18, s80, 22
+; SI-NEXT: v_writelane_b32 v18, s81, 23
+; SI-NEXT: v_writelane_b32 v18, s82, 24
+; SI-NEXT: v_writelane_b32 v18, s83, 25
+; SI-NEXT: v_writelane_b32 v18, s84, 26
+; SI-NEXT: v_writelane_b32 v18, s85, 27
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
@@ -38301,7 +38301,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; SI-NEXT: v_mov_b32_e32 v16, s28
; SI-NEXT: v_mov_b32_e32 v17, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_writelane_b32 v18, s84, 28
+; SI-NEXT: v_writelane_b32 v18, s30, 28
; SI-NEXT: v_readfirstlane_b32 s18, v4
; SI-NEXT: v_readfirstlane_b32 s19, v5
; SI-NEXT: v_readfirstlane_b32 s16, v6
@@ -38319,7 +38319,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; SI-NEXT: v_readfirstlane_b32 s4, v1
; SI-NEXT: s_and_b64 s[20:21], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v2
-; SI-NEXT: v_writelane_b32 v18, s85, 29
+; SI-NEXT: v_writelane_b32 v18, s31, 29
; SI-NEXT: s_cbranch_scc0 .LBB69_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s38, s5, 24
@@ -38640,37 +38640,37 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v18, 28
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s85, v18, 29
-; SI-NEXT: v_readlane_b32 s84, v18, 28
-; SI-NEXT: v_readlane_b32 s83, v18, 27
-; SI-NEXT: v_readlane_b32 s82, v18, 26
-; SI-NEXT: v_readlane_b32 s81, v18, 25
-; SI-NEXT: v_readlane_b32 s80, v18, 24
-; SI-NEXT: v_readlane_b32 s71, v18, 23
-; SI-NEXT: v_readlane_b32 s70, v18, 22
-; SI-NEXT: v_readlane_b32 s69, v18, 21
-; SI-NEXT: v_readlane_b32 s68, v18, 20
-; SI-NEXT: v_readlane_b32 s67, v18, 19
-; SI-NEXT: v_readlane_b32 s66, v18, 18
-; SI-NEXT: v_readlane_b32 s65, v18, 17
-; SI-NEXT: v_readlane_b32 s64, v18, 16
-; SI-NEXT: v_readlane_b32 s55, v18, 15
-; SI-NEXT: v_readlane_b32 s54, v18, 14
-; SI-NEXT: v_readlane_b32 s53, v18, 13
-; SI-NEXT: v_readlane_b32 s52, v18, 12
-; SI-NEXT: v_readlane_b32 s51, v18, 11
-; SI-NEXT: v_readlane_b32 s50, v18, 10
-; SI-NEXT: v_readlane_b32 s49, v18, 9
-; SI-NEXT: v_readlane_b32 s48, v18, 8
-; SI-NEXT: v_readlane_b32 s39, v18, 7
-; SI-NEXT: v_readlane_b32 s38, v18, 6
-; SI-NEXT: v_readlane_b32 s37, v18, 5
-; SI-NEXT: v_readlane_b32 s36, v18, 4
-; SI-NEXT: v_readlane_b32 s35, v18, 3
-; SI-NEXT: v_readlane_b32 s34, v18, 2
-; SI-NEXT: v_readlane_b32 s31, v18, 1
-; SI-NEXT: v_readlane_b32 s30, v18, 0
+; SI-NEXT: v_readlane_b32 s31, v18, 29
+; SI-NEXT: v_readlane_b32 s85, v18, 27
+; SI-NEXT: v_readlane_b32 s84, v18, 26
+; SI-NEXT: v_readlane_b32 s83, v18, 25
+; SI-NEXT: v_readlane_b32 s82, v18, 24
+; SI-NEXT: v_readlane_b32 s81, v18, 23
+; SI-NEXT: v_readlane_b32 s80, v18, 22
+; SI-NEXT: v_readlane_b32 s71, v18, 21
+; SI-NEXT: v_readlane_b32 s70, v18, 20
+; SI-NEXT: v_readlane_b32 s69, v18, 19
+; SI-NEXT: v_readlane_b32 s68, v18, 18
+; SI-NEXT: v_readlane_b32 s67, v18, 17
+; SI-NEXT: v_readlane_b32 s66, v18, 16
+; SI-NEXT: v_readlane_b32 s65, v18, 15
+; SI-NEXT: v_readlane_b32 s64, v18, 14
+; SI-NEXT: v_readlane_b32 s55, v18, 13
+; SI-NEXT: v_readlane_b32 s54, v18, 12
+; SI-NEXT: v_readlane_b32 s53, v18, 11
+; SI-NEXT: v_readlane_b32 s52, v18, 10
+; SI-NEXT: v_readlane_b32 s51, v18, 9
+; SI-NEXT: v_readlane_b32 s50, v18, 8
+; SI-NEXT: v_readlane_b32 s49, v18, 7
+; SI-NEXT: v_readlane_b32 s48, v18, 6
+; SI-NEXT: v_readlane_b32 s39, v18, 5
+; SI-NEXT: v_readlane_b32 s38, v18, 4
+; SI-NEXT: v_readlane_b32 s37, v18, 3
+; SI-NEXT: v_readlane_b32 s36, v18, 2
+; SI-NEXT: v_readlane_b32 s35, v18, 1
+; SI-NEXT: v_readlane_b32 s34, v18, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -38733,24 +38733,24 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v18, s30, 0
-; VI-NEXT: v_writelane_b32 v18, s31, 1
-; VI-NEXT: v_writelane_b32 v18, s34, 2
-; VI-NEXT: v_writelane_b32 v18, s35, 3
-; VI-NEXT: v_writelane_b32 v18, s36, 4
-; VI-NEXT: v_writelane_b32 v18, s37, 5
-; VI-NEXT: v_writelane_b32 v18, s38, 6
-; VI-NEXT: v_writelane_b32 v18, s39, 7
-; VI-NEXT: v_writelane_b32 v18, s48, 8
-; VI-NEXT: v_writelane_b32 v18, s49, 9
-; VI-NEXT: v_writelane_b32 v18, s50, 10
-; VI-NEXT: v_writelane_b32 v18, s51, 11
-; VI-NEXT: v_writelane_b32 v18, s52, 12
-; VI-NEXT: v_writelane_b32 v18, s53, 13
-; VI-NEXT: v_writelane_b32 v18, s54, 14
-; VI-NEXT: v_writelane_b32 v18, s55, 15
-; VI-NEXT: v_writelane_b32 v18, s64, 16
-; VI-NEXT: v_writelane_b32 v18, s65, 17
+; VI-NEXT: v_writelane_b32 v18, s34, 0
+; VI-NEXT: v_writelane_b32 v18, s35, 1
+; VI-NEXT: v_writelane_b32 v18, s36, 2
+; VI-NEXT: v_writelane_b32 v18, s37, 3
+; VI-NEXT: v_writelane_b32 v18, s38, 4
+; VI-NEXT: v_writelane_b32 v18, s39, 5
+; VI-NEXT: v_writelane_b32 v18, s48, 6
+; VI-NEXT: v_writelane_b32 v18, s49, 7
+; VI-NEXT: v_writelane_b32 v18, s50, 8
+; VI-NEXT: v_writelane_b32 v18, s51, 9
+; VI-NEXT: v_writelane_b32 v18, s52, 10
+; VI-NEXT: v_writelane_b32 v18, s53, 11
+; VI-NEXT: v_writelane_b32 v18, s54, 12
+; VI-NEXT: v_writelane_b32 v18, s55, 13
+; VI-NEXT: v_writelane_b32 v18, s64, 14
+; VI-NEXT: v_writelane_b32 v18, s65, 15
+; VI-NEXT: v_writelane_b32 v18, s66, 16
+; VI-NEXT: v_writelane_b32 v18, s67, 17
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
@@ -38766,7 +38766,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; VI-NEXT: v_mov_b32_e32 v16, s28
; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v18, s66, 18
+; VI-NEXT: v_writelane_b32 v18, s30, 18
; VI-NEXT: v_readfirstlane_b32 s18, v4
; VI-NEXT: v_readfirstlane_b32 s19, v5
; VI-NEXT: v_readfirstlane_b32 s16, v6
@@ -38784,7 +38784,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; VI-NEXT: v_readfirstlane_b32 s4, v1
; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: v_writelane_b32 v18, s67, 19
+; VI-NEXT: v_writelane_b32 v18, s31, 19
; VI-NEXT: s_cbranch_scc0 .LBB69_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -39029,27 +39029,27 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: v_readlane_b32 s30, v18, 18
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s67, v18, 19
-; VI-NEXT: v_readlane_b32 s66, v18, 18
-; VI-NEXT: v_readlane_b32 s65, v18, 17
-; VI-NEXT: v_readlane_b32 s64, v18, 16
-; VI-NEXT: v_readlane_b32 s55, v18, 15
-; VI-NEXT: v_readlane_b32 s54, v18, 14
-; VI-NEXT: v_readlane_b32 s53, v18, 13
-; VI-NEXT: v_readlane_b32 s52, v18, 12
-; VI-NEXT: v_readlane_b32 s51, v18, 11
-; VI-NEXT: v_readlane_b32 s50, v18, 10
-; VI-NEXT: v_readlane_b32 s49, v18, 9
-; VI-NEXT: v_readlane_b32 s48, v18, 8
-; VI-NEXT: v_readlane_b32 s39, v18, 7
-; VI-NEXT: v_readlane_b32 s38, v18, 6
-; VI-NEXT: v_readlane_b32 s37, v18, 5
-; VI-NEXT: v_readlane_b32 s36, v18, 4
-; VI-NEXT: v_readlane_b32 s35, v18, 3
-; VI-NEXT: v_readlane_b32 s34, v18, 2
-; VI-NEXT: v_readlane_b32 s31, v18, 1
-; VI-NEXT: v_readlane_b32 s30, v18, 0
+; VI-NEXT: v_readlane_b32 s31, v18, 19
+; VI-NEXT: v_readlane_b32 s67, v18, 17
+; VI-NEXT: v_readlane_b32 s66, v18, 16
+; VI-NEXT: v_readlane_b32 s65, v18, 15
+; VI-NEXT: v_readlane_b32 s64, v18, 14
+; VI-NEXT: v_readlane_b32 s55, v18, 13
+; VI-NEXT: v_readlane_b32 s54, v18, 12
+; VI-NEXT: v_readlane_b32 s53, v18, 11
+; VI-NEXT: v_readlane_b32 s52, v18, 10
+; VI-NEXT: v_readlane_b32 s51, v18, 9
+; VI-NEXT: v_readlane_b32 s50, v18, 8
+; VI-NEXT: v_readlane_b32 s49, v18, 7
+; VI-NEXT: v_readlane_b32 s48, v18, 6
+; VI-NEXT: v_readlane_b32 s39, v18, 5
+; VI-NEXT: v_readlane_b32 s38, v18, 4
+; VI-NEXT: v_readlane_b32 s37, v18, 3
+; VI-NEXT: v_readlane_b32 s36, v18, 2
+; VI-NEXT: v_readlane_b32 s35, v18, 1
+; VI-NEXT: v_readlane_b32 s34, v18, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -39112,20 +39112,20 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v18, s30, 0
-; GFX9-NEXT: v_writelane_b32 v18, s31, 1
-; GFX9-NEXT: v_writelane_b32 v18, s34, 2
-; GFX9-NEXT: v_writelane_b32 v18, s35, 3
-; GFX9-NEXT: v_writelane_b32 v18, s36, 4
-; GFX9-NEXT: v_writelane_b32 v18, s37, 5
-; GFX9-NEXT: v_writelane_b32 v18, s38, 6
-; GFX9-NEXT: v_writelane_b32 v18, s39, 7
-; GFX9-NEXT: v_writelane_b32 v18, s48, 8
-; GFX9-NEXT: v_writelane_b32 v18, s49, 9
-; GFX9-NEXT: v_writelane_b32 v18, s50, 10
-; GFX9-NEXT: v_writelane_b32 v18, s51, 11
-; GFX9-NEXT: v_writelane_b32 v18, s52, 12
-; GFX9-NEXT: v_writelane_b32 v18, s53, 13
+; GFX9-NEXT: v_writelane_b32 v18, s34, 0
+; GFX9-NEXT: v_writelane_b32 v18, s35, 1
+; GFX9-NEXT: v_writelane_b32 v18, s36, 2
+; GFX9-NEXT: v_writelane_b32 v18, s37, 3
+; GFX9-NEXT: v_writelane_b32 v18, s38, 4
+; GFX9-NEXT: v_writelane_b32 v18, s39, 5
+; GFX9-NEXT: v_writelane_b32 v18, s48, 6
+; GFX9-NEXT: v_writelane_b32 v18, s49, 7
+; GFX9-NEXT: v_writelane_b32 v18, s50, 8
+; GFX9-NEXT: v_writelane_b32 v18, s51, 9
+; GFX9-NEXT: v_writelane_b32 v18, s52, 10
+; GFX9-NEXT: v_writelane_b32 v18, s53, 11
+; GFX9-NEXT: v_writelane_b32 v18, s54, 12
+; GFX9-NEXT: v_writelane_b32 v18, s55, 13
; GFX9-NEXT: v_mov_b32_e32 v4, s16
; GFX9-NEXT: v_mov_b32_e32 v5, s17
; GFX9-NEXT: v_mov_b32_e32 v6, s18
@@ -39141,7 +39141,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9-NEXT: v_mov_b32_e32 v16, s28
; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_writelane_b32 v18, s54, 14
+; GFX9-NEXT: v_writelane_b32 v18, s30, 14
; GFX9-NEXT: v_readfirstlane_b32 s18, v4
; GFX9-NEXT: v_readfirstlane_b32 s19, v5
; GFX9-NEXT: v_readfirstlane_b32 s16, v6
@@ -39159,7 +39159,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: v_writelane_b32 v18, s55, 15
+; GFX9-NEXT: v_writelane_b32 v18, s31, 15
; GFX9-NEXT: s_cbranch_scc0 .LBB69_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s56, s5, 24
@@ -39389,23 +39389,23 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX9-NEXT: v_perm_b32 v2, s57, v3, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT: v_readlane_b32 s30, v18, 14
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: v_readlane_b32 s55, v18, 15
-; GFX9-NEXT: v_readlane_b32 s54, v18, 14
-; GFX9-NEXT: v_readlane_b32 s53, v18, 13
-; GFX9-NEXT: v_readlane_b32 s52, v18, 12
-; GFX9-NEXT: v_readlane_b32 s51, v18, 11
-; GFX9-NEXT: v_readlane_b32 s50, v18, 10
-; GFX9-NEXT: v_readlane_b32 s49, v18, 9
-; GFX9-NEXT: v_readlane_b32 s48, v18, 8
-; GFX9-NEXT: v_readlane_b32 s39, v18, 7
-; GFX9-NEXT: v_readlane_b32 s38, v18, 6
-; GFX9-NEXT: v_readlane_b32 s37, v18, 5
-; GFX9-NEXT: v_readlane_b32 s36, v18, 4
-; GFX9-NEXT: v_readlane_b32 s35, v18, 3
-; GFX9-NEXT: v_readlane_b32 s34, v18, 2
-; GFX9-NEXT: v_readlane_b32 s31, v18, 1
-; GFX9-NEXT: v_readlane_b32 s30, v18, 0
+; GFX9-NEXT: v_readlane_b32 s31, v18, 15
+; GFX9-NEXT: v_readlane_b32 s55, v18, 13
+; GFX9-NEXT: v_readlane_b32 s54, v18, 12
+; GFX9-NEXT: v_readlane_b32 s53, v18, 11
+; GFX9-NEXT: v_readlane_b32 s52, v18, 10
+; GFX9-NEXT: v_readlane_b32 s51, v18, 9
+; GFX9-NEXT: v_readlane_b32 s50, v18, 8
+; GFX9-NEXT: v_readlane_b32 s49, v18, 7
+; GFX9-NEXT: v_readlane_b32 s48, v18, 6
+; GFX9-NEXT: v_readlane_b32 s39, v18, 5
+; GFX9-NEXT: v_readlane_b32 s38, v18, 4
+; GFX9-NEXT: v_readlane_b32 s37, v18, 3
+; GFX9-NEXT: v_readlane_b32 s36, v18, 2
+; GFX9-NEXT: v_readlane_b32 s35, v18, 1
+; GFX9-NEXT: v_readlane_b32 s34, v18, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -39468,17 +39468,17 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX11-NEXT: s_xor_saveexec_b32 s4, -1
; GFX11-NEXT: scratch_store_b32 off, v23, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v23, s30, 0
+; GFX11-NEXT: v_writelane_b32 v23, s34, 0
; GFX11-NEXT: s_cmp_lg_u32 s28, 0
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
-; GFX11-NEXT: v_writelane_b32 v23, s31, 1
-; GFX11-NEXT: v_writelane_b32 v23, s34, 2
-; GFX11-NEXT: v_writelane_b32 v23, s35, 3
-; GFX11-NEXT: v_writelane_b32 v23, s36, 4
-; GFX11-NEXT: v_writelane_b32 v23, s37, 5
-; GFX11-NEXT: v_writelane_b32 v23, s38, 6
-; GFX11-NEXT: v_writelane_b32 v23, s39, 7
-; GFX11-NEXT: v_writelane_b32 v23, s48, 8
+; GFX11-NEXT: v_writelane_b32 v23, s35, 1
+; GFX11-NEXT: v_writelane_b32 v23, s36, 2
+; GFX11-NEXT: v_writelane_b32 v23, s37, 3
+; GFX11-NEXT: v_writelane_b32 v23, s38, 4
+; GFX11-NEXT: v_writelane_b32 v23, s39, 5
+; GFX11-NEXT: v_writelane_b32 v23, s48, 6
+; GFX11-NEXT: v_writelane_b32 v23, s30, 7
+; GFX11-NEXT: v_writelane_b32 v23, s31, 8
; GFX11-NEXT: s_cbranch_scc0 .LBB69_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s42, s27, 24
@@ -39600,7 +39600,7 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX11-NEXT: v_mov_b32_e32 v12, 0xc0c0004
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_perm_b32 v5, s34, s28, v12
-; GFX11-NEXT: v_readlane_b32 s34, v23, 2
+; GFX11-NEXT: v_readlane_b32 s34, v23, 0
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_perm_b32 v2, s39, s40, v12
; GFX11-NEXT: v_perm_b32 v4, s37, s36, v12
@@ -39664,19 +39664,19 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
; GFX11-NEXT: v_or_b32_e32 v14, v19, v17
; GFX11-NEXT: v_or_b32_e32 v15, v21, v18
; GFX11-NEXT: v_or_b32_e32 v16, v22, v20
+; GFX11-NEXT: v_readlane_b32 s30, v23, 7
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT: v_readlane_b32 s48, v23, 8
-; GFX11-NEXT: v_readlane_b32 s39, v23, 7
-; GFX11-NEXT: v_readlane_b32 s38, v23, 6
-; GFX11-NEXT: v_readlane_b32 s37, v23, 5
-; GFX11-NEXT: v_readlane_b32 s36, v23, 4
-; GFX11-NEXT: v_readlane_b32 s35, v23, 3
-; GFX11-NEXT: v_readlane_b32 s31, v23, 1
-; GFX11-NEXT: v_readlane_b32 s30, v23, 0
+; GFX11-NEXT: v_readlane_b32 s31, v23, 8
+; GFX11-NEXT: v_readlane_b32 s48, v23, 6
+; GFX11-NEXT: v_readlane_b32 s39, v23, 5
+; GFX11-NEXT: v_readlane_b32 s38, v23, 4
+; GFX11-NEXT: v_readlane_b32 s37, v23, 3
+; GFX11-NEXT: v_readlane_b32 s36, v23, 2
+; GFX11-NEXT: v_readlane_b32 s35, v23, 1
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v23, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -39749,7 +39749,7 @@ end:
ret <64 x i8> %phi
}
-define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
+define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42025,7 +42025,7 @@ end:
ret <8 x i64> %phi
}
-define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v8i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43701,7 +43701,7 @@ end:
ret <8 x i64> %phi
}
-define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) {
+define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43902,7 +43902,7 @@ end:
ret <32 x i16> %phi
}
-define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v32i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44167,7 +44167,7 @@ end:
ret <32 x i16> %phi
}
-define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
+define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44503,7 +44503,7 @@ end:
ret <8 x double> %phi
}
-define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v8f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44938,7 +44938,7 @@ end:
ret <8 x double> %phi
}
-define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) {
+define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45139,7 +45139,7 @@ end:
ret <32 x half> %phi
}
-define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45404,7 +45404,7 @@ end:
ret <32 x half> %phi
}
-define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) {
+define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45804,7 +45804,7 @@ end:
ret <8 x double> %phi
}
-define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v8f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46216,7 +46216,7 @@ end:
ret <8 x double> %phi
}
-define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46489,7 +46489,7 @@ end:
ret <32 x bfloat> %phi
}
-define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46866,7 +46866,7 @@ end:
ret <32 x bfloat> %phi
}
-define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
+define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -48272,7 +48272,7 @@ end:
ret <8 x double> %phi
}
-define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v8f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49886,7 +49886,7 @@ end:
ret <8 x double> %phi
}
-define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
+define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -51313,7 +51313,7 @@ end:
ret <64 x i8> %phi
}
-define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8f64_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -51321,36 +51321,36 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v40, s30, 0
-; SI-NEXT: v_writelane_b32 v40, s31, 1
-; SI-NEXT: v_writelane_b32 v40, s34, 2
-; SI-NEXT: v_writelane_b32 v40, s35, 3
-; SI-NEXT: v_writelane_b32 v40, s36, 4
-; SI-NEXT: v_writelane_b32 v40, s37, 5
-; SI-NEXT: v_writelane_b32 v40, s38, 6
-; SI-NEXT: v_writelane_b32 v40, s39, 7
-; SI-NEXT: v_writelane_b32 v40, s48, 8
-; SI-NEXT: v_writelane_b32 v40, s49, 9
-; SI-NEXT: v_writelane_b32 v40, s50, 10
-; SI-NEXT: v_writelane_b32 v40, s51, 11
-; SI-NEXT: v_writelane_b32 v40, s52, 12
-; SI-NEXT: v_writelane_b32 v40, s53, 13
-; SI-NEXT: v_writelane_b32 v40, s54, 14
-; SI-NEXT: v_writelane_b32 v40, s55, 15
-; SI-NEXT: v_writelane_b32 v40, s64, 16
-; SI-NEXT: v_writelane_b32 v40, s65, 17
-; SI-NEXT: v_writelane_b32 v40, s66, 18
-; SI-NEXT: v_writelane_b32 v40, s67, 19
-; SI-NEXT: v_writelane_b32 v40, s68, 20
-; SI-NEXT: v_writelane_b32 v40, s69, 21
-; SI-NEXT: v_writelane_b32 v40, s70, 22
-; SI-NEXT: v_writelane_b32 v40, s71, 23
-; SI-NEXT: v_writelane_b32 v40, s80, 24
-; SI-NEXT: v_writelane_b32 v40, s81, 25
-; SI-NEXT: v_writelane_b32 v40, s82, 26
-; SI-NEXT: v_writelane_b32 v40, s83, 27
-; SI-NEXT: v_writelane_b32 v40, s84, 28
-; SI-NEXT: v_writelane_b32 v40, s85, 29
+; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
@@ -51366,7 +51366,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v16, s28
; SI-NEXT: v_mov_b32_e32 v17, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_writelane_b32 v40, s86, 30
+; SI-NEXT: v_writelane_b32 v40, s30, 30
; SI-NEXT: v_readfirstlane_b32 s18, v4
; SI-NEXT: v_readfirstlane_b32 s19, v5
; SI-NEXT: v_readfirstlane_b32 s16, v6
@@ -51384,7 +51384,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_readfirstlane_b32 s4, v1
; SI-NEXT: s_and_b64 s[20:21], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v2
-; SI-NEXT: v_writelane_b32 v40, s87, 31
+; SI-NEXT: v_writelane_b32 v40, s31, 31
; SI-NEXT: s_cbranch_scc0 .LBB85_3
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s48, s5, 24
@@ -51783,39 +51783,39 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v40, 30
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s87, v40, 31
-; SI-NEXT: v_readlane_b32 s86, v40, 30
-; SI-NEXT: v_readlane_b32 s85, v40, 29
-; SI-NEXT: v_readlane_b32 s84, v40, 28
-; SI-NEXT: v_readlane_b32 s83, v40, 27
-; SI-NEXT: v_readlane_b32 s82, v40, 26
-; SI-NEXT: v_readlane_b32 s81, v40, 25
-; SI-NEXT: v_readlane_b32 s80, v40, 24
-; SI-NEXT: v_readlane_b32 s71, v40, 23
-; SI-NEXT: v_readlane_b32 s70, v40, 22
-; SI-NEXT: v_readlane_b32 s69, v40, 21
-; SI-NEXT: v_readlane_b32 s68, v40, 20
-; SI-NEXT: v_readlane_b32 s67, v40, 19
-; SI-NEXT: v_readlane_b32 s66, v40, 18
-; SI-NEXT: v_readlane_b32 s65, v40, 17
-; SI-NEXT: v_readlane_b32 s64, v40, 16
-; SI-NEXT: v_readlane_b32 s55, v40, 15
-; SI-NEXT: v_readlane_b32 s54, v40, 14
-; SI-NEXT: v_readlane_b32 s53, v40, 13
-; SI-NEXT: v_readlane_b32 s52, v40, 12
-; SI-NEXT: v_readlane_b32 s51, v40, 11
-; SI-NEXT: v_readlane_b32 s50, v40, 10
-; SI-NEXT: v_readlane_b32 s49, v40, 9
-; SI-NEXT: v_readlane_b32 s48, v40, 8
-; SI-NEXT: v_readlane_b32 s39, v40, 7
-; SI-NEXT: v_readlane_b32 s38, v40, 6
-; SI-NEXT: v_readlane_b32 s37, v40, 5
-; SI-NEXT: v_readlane_b32 s36, v40, 4
-; SI-NEXT: v_readlane_b32 s35, v40, 3
-; SI-NEXT: v_readlane_b32 s34, v40, 2
-; SI-NEXT: v_readlane_b32 s31, v40, 1
-; SI-NEXT: v_readlane_b32 s30, v40, 0
+; SI-NEXT: v_readlane_b32 s31, v40, 31
+; SI-NEXT: v_readlane_b32 s87, v40, 29
+; SI-NEXT: v_readlane_b32 s86, v40, 28
+; SI-NEXT: v_readlane_b32 s85, v40, 27
+; SI-NEXT: v_readlane_b32 s84, v40, 26
+; SI-NEXT: v_readlane_b32 s83, v40, 25
+; SI-NEXT: v_readlane_b32 s82, v40, 24
+; SI-NEXT: v_readlane_b32 s81, v40, 23
+; SI-NEXT: v_readlane_b32 s80, v40, 22
+; SI-NEXT: v_readlane_b32 s71, v40, 21
+; SI-NEXT: v_readlane_b32 s70, v40, 20
+; SI-NEXT: v_readlane_b32 s69, v40, 19
+; SI-NEXT: v_readlane_b32 s68, v40, 18
+; SI-NEXT: v_readlane_b32 s67, v40, 17
+; SI-NEXT: v_readlane_b32 s66, v40, 16
+; SI-NEXT: v_readlane_b32 s65, v40, 15
+; SI-NEXT: v_readlane_b32 s64, v40, 14
+; SI-NEXT: v_readlane_b32 s55, v40, 13
+; SI-NEXT: v_readlane_b32 s54, v40, 12
+; SI-NEXT: v_readlane_b32 s53, v40, 11
+; SI-NEXT: v_readlane_b32 s52, v40, 10
+; SI-NEXT: v_readlane_b32 s51, v40, 9
+; SI-NEXT: v_readlane_b32 s50, v40, 8
+; SI-NEXT: v_readlane_b32 s49, v40, 7
+; SI-NEXT: v_readlane_b32 s48, v40, 6
+; SI-NEXT: v_readlane_b32 s39, v40, 5
+; SI-NEXT: v_readlane_b32 s38, v40, 4
+; SI-NEXT: v_readlane_b32 s37, v40, 3
+; SI-NEXT: v_readlane_b32 s36, v40, 2
+; SI-NEXT: v_readlane_b32 s35, v40, 1
+; SI-NEXT: v_readlane_b32 s34, v40, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -52441,18 +52441,18 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-NEXT: s_cmp_lg_u32 s28, 0
; GFX11-NEXT: s_mov_b32 s42, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-NEXT: v_writelane_b32 v40, s35, 1
+; GFX11-NEXT: v_writelane_b32 v40, s36, 2
+; GFX11-NEXT: v_writelane_b32 v40, s37, 3
+; GFX11-NEXT: v_writelane_b32 v40, s38, 4
+; GFX11-NEXT: v_writelane_b32 v40, s39, 5
+; GFX11-NEXT: v_writelane_b32 v40, s48, 6
+; GFX11-NEXT: v_writelane_b32 v40, s49, 7
+; GFX11-NEXT: v_writelane_b32 v40, s30, 8
+; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_cbranch_scc0 .LBB85_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s90, s27, 24
@@ -52712,21 +52712,21 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
; GFX11-NEXT: v_or_b32_e32 v2, v4, v8
; GFX11-NEXT: v_or_b32_e32 v3, v10, v9
; GFX11-NEXT: v_or_b32_e32 v4, v12, v11
+; GFX11-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off
; GFX11-NEXT: scratch_store_b128 v0, v[15:18], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
+; GFX11-NEXT: v_readlane_b32 s49, v40, 7
+; GFX11-NEXT: v_readlane_b32 s48, v40, 6
+; GFX11-NEXT: v_readlane_b32 s39, v40, 5
+; GFX11-NEXT: v_readlane_b32 s38, v40, 4
+; GFX11-NEXT: v_readlane_b32 s37, v40, 3
+; GFX11-NEXT: v_readlane_b32 s36, v40, 2
+; GFX11-NEXT: v_readlane_b32 s35, v40, 1
+; GFX11-NEXT: v_readlane_b32 s34, v40, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -52749,7 +52749,7 @@ end:
ret <64 x i8> %phi
}
-define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
+define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -55025,7 +55025,7 @@ end:
ret <8 x double> %phi
}
-define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v8f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -56701,7 +56701,7 @@ end:
ret <8 x double> %phi
}
-define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) {
+define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -57149,7 +57149,7 @@ end:
ret <32 x half> %phi
}
-define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -57157,13 +57157,14 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v16, s30, 0
-; SI-NEXT: v_writelane_b32 v16, s31, 1
-; SI-NEXT: v_writelane_b32 v16, s34, 2
-; SI-NEXT: v_writelane_b32 v16, s35, 3
-; SI-NEXT: v_writelane_b32 v16, s36, 4
-; SI-NEXT: v_writelane_b32 v16, s37, 5
-; SI-NEXT: v_writelane_b32 v16, s38, 6
+; SI-NEXT: v_writelane_b32 v16, s34, 0
+; SI-NEXT: v_writelane_b32 v16, s35, 1
+; SI-NEXT: v_writelane_b32 v16, s36, 2
+; SI-NEXT: v_writelane_b32 v16, s37, 3
+; SI-NEXT: v_writelane_b32 v16, s38, 4
+; SI-NEXT: v_writelane_b32 v16, s39, 5
+; SI-NEXT: v_writelane_b32 v16, s30, 6
+; SI-NEXT: v_writelane_b32 v16, s31, 7
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT: s_lshr_b32 s92, s29, 16
@@ -57181,7 +57182,6 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s78, s17, 16
; SI-NEXT: s_lshr_b32 s94, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_writelane_b32 v16, s39, 7
; SI-NEXT: v_readfirstlane_b32 s37, v1
; SI-NEXT: v_readfirstlane_b32 s38, v0
; SI-NEXT: v_readfirstlane_b32 s93, v3
@@ -57400,6 +57400,7 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s19, s41, 0xffff
; SI-NEXT: s_lshl_b32 s20, s93, 16
; SI-NEXT: s_or_b32 s19, s19, s20
+; SI-NEXT: v_readlane_b32 s30, v16, 6
; SI-NEXT: v_mov_b32_e32 v0, s14
; SI-NEXT: v_mov_b32_e32 v1, s15
; SI-NEXT: v_mov_b32_e32 v2, s12
@@ -57416,14 +57417,13 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v13, s17
; SI-NEXT: v_mov_b32_e32 v14, s18
; SI-NEXT: v_mov_b32_e32 v15, s19
-; SI-NEXT: v_readlane_b32 s39, v16, 7
-; SI-NEXT: v_readlane_b32 s38, v16, 6
-; SI-NEXT: v_readlane_b32 s37, v16, 5
-; SI-NEXT: v_readlane_b32 s36, v16, 4
-; SI-NEXT: v_readlane_b32 s35, v16, 3
-; SI-NEXT: v_readlane_b32 s34, v16, 2
-; SI-NEXT: v_readlane_b32 s31, v16, 1
-; SI-NEXT: v_readlane_b32 s30, v16, 0
+; SI-NEXT: v_readlane_b32 s31, v16, 7
+; SI-NEXT: v_readlane_b32 s39, v16, 5
+; SI-NEXT: v_readlane_b32 s38, v16, 4
+; SI-NEXT: v_readlane_b32 s37, v16, 3
+; SI-NEXT: v_readlane_b32 s36, v16, 2
+; SI-NEXT: v_readlane_b32 s35, v16, 1
+; SI-NEXT: v_readlane_b32 s34, v16, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -57695,7 +57695,7 @@ end:
ret <32 x half> %phi
}
-define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) {
+define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58047,7 +58047,7 @@ end:
ret <32 x i16> %phi
}
-define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v32i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58482,7 +58482,7 @@ end:
ret <32 x i16> %phi
}
-define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58877,7 +58877,7 @@ end:
ret <32 x bfloat> %phi
}
-define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58885,13 +58885,14 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v17, s30, 0
-; SI-NEXT: v_writelane_b32 v17, s31, 1
-; SI-NEXT: v_writelane_b32 v17, s34, 2
-; SI-NEXT: v_writelane_b32 v17, s35, 3
-; SI-NEXT: v_writelane_b32 v17, s36, 4
-; SI-NEXT: v_writelane_b32 v17, s37, 5
-; SI-NEXT: v_writelane_b32 v17, s38, 6
+; SI-NEXT: v_writelane_b32 v17, s34, 0
+; SI-NEXT: v_writelane_b32 v17, s35, 1
+; SI-NEXT: v_writelane_b32 v17, s36, 2
+; SI-NEXT: v_writelane_b32 v17, s37, 3
+; SI-NEXT: v_writelane_b32 v17, s38, 4
+; SI-NEXT: v_writelane_b32 v17, s39, 5
+; SI-NEXT: v_writelane_b32 v17, s30, 6
+; SI-NEXT: v_writelane_b32 v17, s31, 7
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT: s_lshr_b32 s35, s29, 16
@@ -58909,7 +58910,6 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
; SI-NEXT: s_lshr_b32 s79, s17, 16
; SI-NEXT: s_lshr_b32 s78, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_writelane_b32 v17, s39, 7
; SI-NEXT: v_readfirstlane_b32 s38, v1
; SI-NEXT: v_readfirstlane_b32 s36, v0
; SI-NEXT: v_readfirstlane_b32 s39, v3
@@ -59127,15 +59127,15 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s9
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s6
+; SI-NEXT: v_readlane_b32 s30, v17, 6
; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
-; SI-NEXT: v_readlane_b32 s39, v17, 7
-; SI-NEXT: v_readlane_b32 s38, v17, 6
-; SI-NEXT: v_readlane_b32 s37, v17, 5
-; SI-NEXT: v_readlane_b32 s36, v17, 4
-; SI-NEXT: v_readlane_b32 s35, v17, 3
-; SI-NEXT: v_readlane_b32 s34, v17, 2
-; SI-NEXT: v_readlane_b32 s31, v17, 1
-; SI-NEXT: v_readlane_b32 s30, v17, 0
+; SI-NEXT: v_readlane_b32 s31, v17, 7
+; SI-NEXT: v_readlane_b32 s39, v17, 5
+; SI-NEXT: v_readlane_b32 s38, v17, 4
+; SI-NEXT: v_readlane_b32 s37, v17, 3
+; SI-NEXT: v_readlane_b32 s36, v17, 2
+; SI-NEXT: v_readlane_b32 s35, v17, 1
+; SI-NEXT: v_readlane_b32 s34, v17, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -59423,7 +59423,7 @@ end:
ret <32 x bfloat> %phi
}
-define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
+define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -60897,7 +60897,7 @@ end:
ret <32 x i16> %phi
}
-define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v32i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -62516,7 +62516,7 @@ end:
ret <32 x i16> %phi
}
-define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
+define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -64480,7 +64480,7 @@ end:
ret <64 x i8> %phi
}
-define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32i16_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -64489,41 +64489,40 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v6, s30, 0
-; SI-NEXT: v_writelane_b32 v6, s31, 1
-; SI-NEXT: v_writelane_b32 v6, s34, 2
-; SI-NEXT: v_writelane_b32 v6, s35, 3
-; SI-NEXT: v_writelane_b32 v6, s36, 4
-; SI-NEXT: v_writelane_b32 v6, s37, 5
-; SI-NEXT: v_writelane_b32 v6, s38, 6
-; SI-NEXT: v_writelane_b32 v6, s39, 7
-; SI-NEXT: v_writelane_b32 v6, s48, 8
-; SI-NEXT: v_writelane_b32 v6, s49, 9
-; SI-NEXT: v_writelane_b32 v6, s50, 10
-; SI-NEXT: v_writelane_b32 v6, s51, 11
-; SI-NEXT: v_writelane_b32 v6, s52, 12
-; SI-NEXT: v_writelane_b32 v6, s53, 13
-; SI-NEXT: v_writelane_b32 v6, s54, 14
-; SI-NEXT: v_writelane_b32 v6, s55, 15
-; SI-NEXT: v_writelane_b32 v6, s64, 16
-; SI-NEXT: v_writelane_b32 v6, s65, 17
-; SI-NEXT: v_writelane_b32 v6, s66, 18
-; SI-NEXT: v_writelane_b32 v6, s67, 19
-; SI-NEXT: v_writelane_b32 v6, s68, 20
-; SI-NEXT: v_writelane_b32 v6, s69, 21
-; SI-NEXT: v_writelane_b32 v6, s70, 22
-; SI-NEXT: v_writelane_b32 v6, s71, 23
-; SI-NEXT: v_writelane_b32 v6, s80, 24
-; SI-NEXT: v_writelane_b32 v6, s81, 25
-; SI-NEXT: v_writelane_b32 v6, s82, 26
-; SI-NEXT: v_writelane_b32 v6, s83, 27
-; SI-NEXT: v_writelane_b32 v6, s84, 28
-; SI-NEXT: v_writelane_b32 v6, s85, 29
-; SI-NEXT: v_writelane_b32 v6, s86, 30
-; SI-NEXT: v_writelane_b32 v6, s87, 31
-; SI-NEXT: v_writelane_b32 v6, s96, 32
-; SI-NEXT: v_writelane_b32 v6, s97, 33
-; SI-NEXT: v_writelane_b32 v6, s98, 34
+; SI-NEXT: v_writelane_b32 v6, s34, 0
+; SI-NEXT: v_writelane_b32 v6, s35, 1
+; SI-NEXT: v_writelane_b32 v6, s36, 2
+; SI-NEXT: v_writelane_b32 v6, s37, 3
+; SI-NEXT: v_writelane_b32 v6, s38, 4
+; SI-NEXT: v_writelane_b32 v6, s39, 5
+; SI-NEXT: v_writelane_b32 v6, s48, 6
+; SI-NEXT: v_writelane_b32 v6, s49, 7
+; SI-NEXT: v_writelane_b32 v6, s50, 8
+; SI-NEXT: v_writelane_b32 v6, s51, 9
+; SI-NEXT: v_writelane_b32 v6, s52, 10
+; SI-NEXT: v_writelane_b32 v6, s53, 11
+; SI-NEXT: v_writelane_b32 v6, s54, 12
+; SI-NEXT: v_writelane_b32 v6, s55, 13
+; SI-NEXT: v_writelane_b32 v6, s64, 14
+; SI-NEXT: v_writelane_b32 v6, s65, 15
+; SI-NEXT: v_writelane_b32 v6, s66, 16
+; SI-NEXT: v_writelane_b32 v6, s67, 17
+; SI-NEXT: v_writelane_b32 v6, s68, 18
+; SI-NEXT: v_writelane_b32 v6, s69, 19
+; SI-NEXT: v_writelane_b32 v6, s70, 20
+; SI-NEXT: v_writelane_b32 v6, s71, 21
+; SI-NEXT: v_writelane_b32 v6, s80, 22
+; SI-NEXT: v_writelane_b32 v6, s81, 23
+; SI-NEXT: v_writelane_b32 v6, s82, 24
+; SI-NEXT: v_writelane_b32 v6, s83, 25
+; SI-NEXT: v_writelane_b32 v6, s84, 26
+; SI-NEXT: v_writelane_b32 v6, s85, 27
+; SI-NEXT: v_writelane_b32 v6, s86, 28
+; SI-NEXT: v_writelane_b32 v6, s87, 29
+; SI-NEXT: v_writelane_b32 v6, s96, 30
+; SI-NEXT: v_writelane_b32 v6, s97, 31
+; SI-NEXT: v_writelane_b32 v6, s98, 32
+; SI-NEXT: v_writelane_b32 v6, s99, 33
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_lshr_b32 s68, s29, 16
@@ -64541,12 +64540,13 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: s_lshr_b32 s83, s17, 16
; SI-NEXT: s_lshr_b32 s84, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; SI-NEXT: v_writelane_b32 v6, s30, 34
; SI-NEXT: v_readfirstlane_b32 s56, v2
; SI-NEXT: v_readfirstlane_b32 s58, v1
; SI-NEXT: v_readfirstlane_b32 s69, v4
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s60, v5
-; SI-NEXT: v_writelane_b32 v6, s99, 35
+; SI-NEXT: v_writelane_b32 v6, s31, 35
; SI-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB97_4
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -65003,44 +65003,44 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v6, 34
; SI-NEXT: v_readlane_b32 s19, v7, 9
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v6, 35
-; SI-NEXT: v_readlane_b32 s98, v6, 34
-; SI-NEXT: v_readlane_b32 s97, v6, 33
-; SI-NEXT: v_readlane_b32 s96, v6, 32
-; SI-NEXT: v_readlane_b32 s87, v6, 31
-; SI-NEXT: v_readlane_b32 s86, v6, 30
-; SI-NEXT: v_readlane_b32 s85, v6, 29
-; SI-NEXT: v_readlane_b32 s84, v6, 28
-; SI-NEXT: v_readlane_b32 s83, v6, 27
-; SI-NEXT: v_readlane_b32 s82, v6, 26
-; SI-NEXT: v_readlane_b32 s81, v6, 25
-; SI-NEXT: v_readlane_b32 s80, v6, 24
-; SI-NEXT: v_readlane_b32 s71, v6, 23
-; SI-NEXT: v_readlane_b32 s70, v6, 22
-; SI-NEXT: v_readlane_b32 s69, v6, 21
-; SI-NEXT: v_readlane_b32 s68, v6, 20
-; SI-NEXT: v_readlane_b32 s67, v6, 19
-; SI-NEXT: v_readlane_b32 s66, v6, 18
-; SI-NEXT: v_readlane_b32 s65, v6, 17
-; SI-NEXT: v_readlane_b32 s64, v6, 16
-; SI-NEXT: v_readlane_b32 s55, v6, 15
-; SI-NEXT: v_readlane_b32 s54, v6, 14
-; SI-NEXT: v_readlane_b32 s53, v6, 13
-; SI-NEXT: v_readlane_b32 s52, v6, 12
-; SI-NEXT: v_readlane_b32 s51, v6, 11
-; SI-NEXT: v_readlane_b32 s50, v6, 10
-; SI-NEXT: v_readlane_b32 s49, v6, 9
-; SI-NEXT: v_readlane_b32 s48, v6, 8
-; SI-NEXT: v_readlane_b32 s39, v6, 7
-; SI-NEXT: v_readlane_b32 s38, v6, 6
-; SI-NEXT: v_readlane_b32 s37, v6, 5
-; SI-NEXT: v_readlane_b32 s36, v6, 4
-; SI-NEXT: v_readlane_b32 s35, v6, 3
-; SI-NEXT: v_readlane_b32 s34, v6, 2
-; SI-NEXT: v_readlane_b32 s31, v6, 1
-; SI-NEXT: v_readlane_b32 s30, v6, 0
+; SI-NEXT: v_readlane_b32 s31, v6, 35
+; SI-NEXT: v_readlane_b32 s99, v6, 33
+; SI-NEXT: v_readlane_b32 s98, v6, 32
+; SI-NEXT: v_readlane_b32 s97, v6, 31
+; SI-NEXT: v_readlane_b32 s96, v6, 30
+; SI-NEXT: v_readlane_b32 s87, v6, 29
+; SI-NEXT: v_readlane_b32 s86, v6, 28
+; SI-NEXT: v_readlane_b32 s85, v6, 27
+; SI-NEXT: v_readlane_b32 s84, v6, 26
+; SI-NEXT: v_readlane_b32 s83, v6, 25
+; SI-NEXT: v_readlane_b32 s82, v6, 24
+; SI-NEXT: v_readlane_b32 s81, v6, 23
+; SI-NEXT: v_readlane_b32 s80, v6, 22
+; SI-NEXT: v_readlane_b32 s71, v6, 21
+; SI-NEXT: v_readlane_b32 s70, v6, 20
+; SI-NEXT: v_readlane_b32 s69, v6, 19
+; SI-NEXT: v_readlane_b32 s68, v6, 18
+; SI-NEXT: v_readlane_b32 s67, v6, 17
+; SI-NEXT: v_readlane_b32 s66, v6, 16
+; SI-NEXT: v_readlane_b32 s65, v6, 15
+; SI-NEXT: v_readlane_b32 s64, v6, 14
+; SI-NEXT: v_readlane_b32 s55, v6, 13
+; SI-NEXT: v_readlane_b32 s54, v6, 12
+; SI-NEXT: v_readlane_b32 s53, v6, 11
+; SI-NEXT: v_readlane_b32 s52, v6, 10
+; SI-NEXT: v_readlane_b32 s51, v6, 9
+; SI-NEXT: v_readlane_b32 s50, v6, 8
+; SI-NEXT: v_readlane_b32 s49, v6, 7
+; SI-NEXT: v_readlane_b32 s48, v6, 6
+; SI-NEXT: v_readlane_b32 s39, v6, 5
+; SI-NEXT: v_readlane_b32 s38, v6, 4
+; SI-NEXT: v_readlane_b32 s37, v6, 3
+; SI-NEXT: v_readlane_b32 s36, v6, 2
+; SI-NEXT: v_readlane_b32 s35, v6, 1
+; SI-NEXT: v_readlane_b32 s34, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -65115,24 +65115,24 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v18, s30, 0
-; VI-NEXT: v_writelane_b32 v18, s31, 1
-; VI-NEXT: v_writelane_b32 v18, s34, 2
-; VI-NEXT: v_writelane_b32 v18, s35, 3
-; VI-NEXT: v_writelane_b32 v18, s36, 4
-; VI-NEXT: v_writelane_b32 v18, s37, 5
-; VI-NEXT: v_writelane_b32 v18, s38, 6
-; VI-NEXT: v_writelane_b32 v18, s39, 7
-; VI-NEXT: v_writelane_b32 v18, s48, 8
-; VI-NEXT: v_writelane_b32 v18, s49, 9
-; VI-NEXT: v_writelane_b32 v18, s50, 10
-; VI-NEXT: v_writelane_b32 v18, s51, 11
-; VI-NEXT: v_writelane_b32 v18, s52, 12
-; VI-NEXT: v_writelane_b32 v18, s53, 13
-; VI-NEXT: v_writelane_b32 v18, s54, 14
-; VI-NEXT: v_writelane_b32 v18, s55, 15
-; VI-NEXT: v_writelane_b32 v18, s64, 16
-; VI-NEXT: v_writelane_b32 v18, s65, 17
+; VI-NEXT: v_writelane_b32 v18, s34, 0
+; VI-NEXT: v_writelane_b32 v18, s35, 1
+; VI-NEXT: v_writelane_b32 v18, s36, 2
+; VI-NEXT: v_writelane_b32 v18, s37, 3
+; VI-NEXT: v_writelane_b32 v18, s38, 4
+; VI-NEXT: v_writelane_b32 v18, s39, 5
+; VI-NEXT: v_writelane_b32 v18, s48, 6
+; VI-NEXT: v_writelane_b32 v18, s49, 7
+; VI-NEXT: v_writelane_b32 v18, s50, 8
+; VI-NEXT: v_writelane_b32 v18, s51, 9
+; VI-NEXT: v_writelane_b32 v18, s52, 10
+; VI-NEXT: v_writelane_b32 v18, s53, 11
+; VI-NEXT: v_writelane_b32 v18, s54, 12
+; VI-NEXT: v_writelane_b32 v18, s55, 13
+; VI-NEXT: v_writelane_b32 v18, s64, 14
+; VI-NEXT: v_writelane_b32 v18, s65, 15
+; VI-NEXT: v_writelane_b32 v18, s66, 16
+; VI-NEXT: v_writelane_b32 v18, s67, 17
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
@@ -65148,7 +65148,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v16, s28
; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v18, s66, 18
+; VI-NEXT: v_writelane_b32 v18, s30, 18
; VI-NEXT: v_readfirstlane_b32 s18, v4
; VI-NEXT: v_readfirstlane_b32 s19, v5
; VI-NEXT: v_readfirstlane_b32 s16, v6
@@ -65166,7 +65166,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI-NEXT: v_readfirstlane_b32 s4, v1
; VI-NEXT: s_and_b64 s[20:21], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: v_writelane_b32 v18, s67, 19
+; VI-NEXT: v_writelane_b32 v18, s31, 19
; VI-NEXT: s_cbranch_scc0 .LBB97_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
@@ -65475,27 +65475,27 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: v_readlane_b32 s30, v18, 18
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_readlane_b32 s67, v18, 19
-; VI-NEXT: v_readlane_b32 s66, v18, 18
-; VI-NEXT: v_readlane_b32 s65, v18, 17
-; VI-NEXT: v_readlane_b32 s64, v18, 16
-; VI-NEXT: v_readlane_b32 s55, v18, 15
-; VI-NEXT: v_readlane_b32 s54, v18, 14
-; VI-NEXT: v_readlane_b32 s53, v18, 13
-; VI-NEXT: v_readlane_b32 s52, v18, 12
-; VI-NEXT: v_readlane_b32 s51, v18, 11
-; VI-NEXT: v_readlane_b32 s50, v18, 10
-; VI-NEXT: v_readlane_b32 s49, v18, 9
-; VI-NEXT: v_readlane_b32 s48, v18, 8
-; VI-NEXT: v_readlane_b32 s39, v18, 7
-; VI-NEXT: v_readlane_b32 s38, v18, 6
-; VI-NEXT: v_readlane_b32 s37, v18, 5
-; VI-NEXT: v_readlane_b32 s36, v18, 4
-; VI-NEXT: v_readlane_b32 s35, v18, 3
-; VI-NEXT: v_readlane_b32 s34, v18, 2
-; VI-NEXT: v_readlane_b32 s31, v18, 1
-; VI-NEXT: v_readlane_b32 s30, v18, 0
+; VI-NEXT: v_readlane_b32 s31, v18, 19
+; VI-NEXT: v_readlane_b32 s67, v18, 17
+; VI-NEXT: v_readlane_b32 s66, v18, 16
+; VI-NEXT: v_readlane_b32 s65, v18, 15
+; VI-NEXT: v_readlane_b32 s64, v18, 14
+; VI-NEXT: v_readlane_b32 s55, v18, 13
+; VI-NEXT: v_readlane_b32 s54, v18, 12
+; VI-NEXT: v_readlane_b32 s53, v18, 11
+; VI-NEXT: v_readlane_b32 s52, v18, 10
+; VI-NEXT: v_readlane_b32 s51, v18, 9
+; VI-NEXT: v_readlane_b32 s50, v18, 8
+; VI-NEXT: v_readlane_b32 s49, v18, 7
+; VI-NEXT: v_readlane_b32 s48, v18, 6
+; VI-NEXT: v_readlane_b32 s39, v18, 5
+; VI-NEXT: v_readlane_b32 s38, v18, 4
+; VI-NEXT: v_readlane_b32 s37, v18, 3
+; VI-NEXT: v_readlane_b32 s36, v18, 2
+; VI-NEXT: v_readlane_b32 s35, v18, 1
+; VI-NEXT: v_readlane_b32 s34, v18, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -65865,18 +65865,18 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-NEXT: s_cmp_lg_u32 s28, 0
; GFX11-NEXT: s_mov_b32 s42, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-NEXT: v_writelane_b32 v40, s35, 1
+; GFX11-NEXT: v_writelane_b32 v40, s36, 2
+; GFX11-NEXT: v_writelane_b32 v40, s37, 3
+; GFX11-NEXT: v_writelane_b32 v40, s38, 4
+; GFX11-NEXT: v_writelane_b32 v40, s39, 5
+; GFX11-NEXT: v_writelane_b32 v40, s48, 6
+; GFX11-NEXT: v_writelane_b32 v40, s49, 7
+; GFX11-NEXT: v_writelane_b32 v40, s30, 8
+; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_cbranch_scc0 .LBB97_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s43, s27, 24
@@ -66144,21 +66144,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; GFX11-NEXT: v_or_b32_e32 v2, v4, v17
; GFX11-NEXT: v_or_b32_e32 v3, v19, v15
; GFX11-NEXT: v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:32
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
+; GFX11-NEXT: v_readlane_b32 s49, v40, 7
+; GFX11-NEXT: v_readlane_b32 s48, v40, 6
+; GFX11-NEXT: v_readlane_b32 s39, v40, 5
+; GFX11-NEXT: v_readlane_b32 s38, v40, 4
+; GFX11-NEXT: v_readlane_b32 s37, v40, 3
+; GFX11-NEXT: v_readlane_b32 s36, v40, 2
+; GFX11-NEXT: v_readlane_b32 s35, v40, 1
+; GFX11-NEXT: v_readlane_b32 s34, v40, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -66181,7 +66181,7 @@ end:
ret <64 x i8> %phi
}
-define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
+define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -68660,7 +68660,7 @@ end:
ret <32 x i16> %phi
}
-define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v32i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -68677,49 +68677,49 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v40, s30, 0
-; SI-NEXT: v_writelane_b32 v40, s31, 1
-; SI-NEXT: v_writelane_b32 v40, s34, 2
-; SI-NEXT: v_writelane_b32 v40, s35, 3
-; SI-NEXT: v_writelane_b32 v40, s36, 4
-; SI-NEXT: v_writelane_b32 v40, s37, 5
-; SI-NEXT: v_writelane_b32 v40, s38, 6
-; SI-NEXT: v_writelane_b32 v40, s39, 7
-; SI-NEXT: v_writelane_b32 v40, s48, 8
-; SI-NEXT: v_writelane_b32 v40, s49, 9
-; SI-NEXT: v_writelane_b32 v40, s50, 10
-; SI-NEXT: v_writelane_b32 v40, s51, 11
-; SI-NEXT: v_writelane_b32 v40, s52, 12
-; SI-NEXT: v_writelane_b32 v40, s53, 13
-; SI-NEXT: v_writelane_b32 v40, s54, 14
-; SI-NEXT: v_writelane_b32 v40, s55, 15
-; SI-NEXT: v_writelane_b32 v40, s64, 16
-; SI-NEXT: v_writelane_b32 v40, s65, 17
-; SI-NEXT: v_writelane_b32 v40, s66, 18
-; SI-NEXT: v_writelane_b32 v40, s67, 19
-; SI-NEXT: v_writelane_b32 v40, s68, 20
-; SI-NEXT: v_writelane_b32 v40, s69, 21
-; SI-NEXT: v_writelane_b32 v40, s70, 22
-; SI-NEXT: v_writelane_b32 v40, s71, 23
-; SI-NEXT: v_writelane_b32 v40, s80, 24
-; SI-NEXT: v_writelane_b32 v40, s81, 25
-; SI-NEXT: v_writelane_b32 v40, s82, 26
-; SI-NEXT: v_writelane_b32 v40, s83, 27
-; SI-NEXT: v_writelane_b32 v40, s84, 28
-; SI-NEXT: v_writelane_b32 v40, s85, 29
-; SI-NEXT: v_writelane_b32 v40, s86, 30
-; SI-NEXT: v_writelane_b32 v40, s87, 31
-; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
+; SI-NEXT: v_writelane_b32 v40, s96, 30
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v40, s97, 33
+; SI-NEXT: v_writelane_b32 v40, s97, 31
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s28, 0
; SI-NEXT: v_writelane_b32 v41, s26, 1
+; SI-NEXT: v_writelane_b32 v40, s98, 32
; SI-NEXT: v_writelane_b32 v41, s23, 2
+; SI-NEXT: v_writelane_b32 v40, s99, 33
; SI-NEXT: v_writelane_b32 v41, s22, 3
-; SI-NEXT: v_writelane_b32 v40, s98, 34
+; SI-NEXT: v_writelane_b32 v40, s30, 34
; SI-NEXT: v_writelane_b32 v41, s21, 4
-; SI-NEXT: v_writelane_b32 v40, s99, 35
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: s_mov_b32 s88, s29
; SI-NEXT: s_mov_b32 s30, s25
; SI-NEXT: s_mov_b32 s29, s24
@@ -69377,6 +69377,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_lshl_b32 s20, s45, 16
; SI-NEXT: s_or_b32 s5, s5, s20
+; SI-NEXT: v_readlane_b32 s30, v40, 34
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
@@ -69393,42 +69394,41 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v13, s7
; SI-NEXT: v_mov_b32_e32 v14, s4
; SI-NEXT: v_mov_b32_e32 v15, s5
-; SI-NEXT: v_readlane_b32 s99, v40, 35
-; SI-NEXT: v_readlane_b32 s98, v40, 34
-; SI-NEXT: v_readlane_b32 s97, v40, 33
-; SI-NEXT: v_readlane_b32 s96, v40, 32
-; SI-NEXT: v_readlane_b32 s87, v40, 31
-; SI-NEXT: v_readlane_b32 s86, v40, 30
-; SI-NEXT: v_readlane_b32 s85, v40, 29
-; SI-NEXT: v_readlane_b32 s84, v40, 28
-; SI-NEXT: v_readlane_b32 s83, v40, 27
-; SI-NEXT: v_readlane_b32 s82, v40, 26
-; SI-NEXT: v_readlane_b32 s81, v40, 25
-; SI-NEXT: v_readlane_b32 s80, v40, 24
-; SI-NEXT: v_readlane_b32 s71, v40, 23
-; SI-NEXT: v_readlane_b32 s70, v40, 22
-; SI-NEXT: v_readlane_b32 s69, v40, 21
-; SI-NEXT: v_readlane_b32 s68, v40, 20
-; SI-NEXT: v_readlane_b32 s67, v40, 19
-; SI-NEXT: v_readlane_b32 s66, v40, 18
-; SI-NEXT: v_readlane_b32 s65, v40, 17
-; SI-NEXT: v_readlane_b32 s64, v40, 16
-; SI-NEXT: v_readlane_b32 s55, v40, 15
-; SI-NEXT: v_readlane_b32 s54, v40, 14
-; SI-NEXT: v_readlane_b32 s53, v40, 13
-; SI-NEXT: v_readlane_b32 s52, v40, 12
-; SI-NEXT: v_readlane_b32 s51, v40, 11
-; SI-NEXT: v_readlane_b32 s50, v40, 10
-; SI-NEXT: v_readlane_b32 s49, v40, 9
-; SI-NEXT: v_readlane_b32 s48, v40, 8
-; SI-NEXT: v_readlane_b32 s39, v40, 7
-; SI-NEXT: v_readlane_b32 s38, v40, 6
-; SI-NEXT: v_readlane_b32 s37, v40, 5
-; SI-NEXT: v_readlane_b32 s36, v40, 4
-; SI-NEXT: v_readlane_b32 s35, v40, 3
-; SI-NEXT: v_readlane_b32 s34, v40, 2
-; SI-NEXT: v_readlane_b32 s31, v40, 1
-; SI-NEXT: v_readlane_b32 s30, v40, 0
+; SI-NEXT: v_readlane_b32 s31, v40, 35
+; SI-NEXT: v_readlane_b32 s99, v40, 33
+; SI-NEXT: v_readlane_b32 s98, v40, 32
+; SI-NEXT: v_readlane_b32 s97, v40, 31
+; SI-NEXT: v_readlane_b32 s96, v40, 30
+; SI-NEXT: v_readlane_b32 s87, v40, 29
+; SI-NEXT: v_readlane_b32 s86, v40, 28
+; SI-NEXT: v_readlane_b32 s85, v40, 27
+; SI-NEXT: v_readlane_b32 s84, v40, 26
+; SI-NEXT: v_readlane_b32 s83, v40, 25
+; SI-NEXT: v_readlane_b32 s82, v40, 24
+; SI-NEXT: v_readlane_b32 s81, v40, 23
+; SI-NEXT: v_readlane_b32 s80, v40, 22
+; SI-NEXT: v_readlane_b32 s71, v40, 21
+; SI-NEXT: v_readlane_b32 s70, v40, 20
+; SI-NEXT: v_readlane_b32 s69, v40, 19
+; SI-NEXT: v_readlane_b32 s68, v40, 18
+; SI-NEXT: v_readlane_b32 s67, v40, 17
+; SI-NEXT: v_readlane_b32 s66, v40, 16
+; SI-NEXT: v_readlane_b32 s65, v40, 15
+; SI-NEXT: v_readlane_b32 s64, v40, 14
+; SI-NEXT: v_readlane_b32 s55, v40, 13
+; SI-NEXT: v_readlane_b32 s54, v40, 12
+; SI-NEXT: v_readlane_b32 s53, v40, 11
+; SI-NEXT: v_readlane_b32 s52, v40, 10
+; SI-NEXT: v_readlane_b32 s51, v40, 9
+; SI-NEXT: v_readlane_b32 s50, v40, 8
+; SI-NEXT: v_readlane_b32 s49, v40, 7
+; SI-NEXT: v_readlane_b32 s48, v40, 6
+; SI-NEXT: v_readlane_b32 s39, v40, 5
+; SI-NEXT: v_readlane_b32 s38, v40, 4
+; SI-NEXT: v_readlane_b32 s37, v40, 3
+; SI-NEXT: v_readlane_b32 s36, v40, 2
+; SI-NEXT: v_readlane_b32 s35, v40, 1
+; SI-NEXT: v_readlane_b32 s34, v40, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -70707,7 +70707,7 @@ end:
ret <32 x i16> %phi
}
-define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -71200,7 +71200,7 @@ end:
ret <32 x bfloat> %phi
}
-define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -71705,7 +71705,7 @@ end:
ret <32 x bfloat> %phi
}
-define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) {
+define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -73218,7 +73218,7 @@ end:
ret <32 x half> %phi
}
-define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -75131,7 +75131,7 @@ end:
ret <32 x half> %phi
}
-define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
+define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -76963,7 +76963,7 @@ end:
ret <64 x i8> %phi
}
-define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32f16_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -76972,41 +76972,40 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v18, s30, 0
-; SI-NEXT: v_writelane_b32 v18, s31, 1
-; SI-NEXT: v_writelane_b32 v18, s34, 2
-; SI-NEXT: v_writelane_b32 v18, s35, 3
-; SI-NEXT: v_writelane_b32 v18, s36, 4
-; SI-NEXT: v_writelane_b32 v18, s37, 5
-; SI-NEXT: v_writelane_b32 v18, s38, 6
-; SI-NEXT: v_writelane_b32 v18, s39, 7
-; SI-NEXT: v_writelane_b32 v18, s48, 8
-; SI-NEXT: v_writelane_b32 v18, s49, 9
-; SI-NEXT: v_writelane_b32 v18, s50, 10
-; SI-NEXT: v_writelane_b32 v18, s51, 11
-; SI-NEXT: v_writelane_b32 v18, s52, 12
-; SI-NEXT: v_writelane_b32 v18, s53, 13
-; SI-NEXT: v_writelane_b32 v18, s54, 14
-; SI-NEXT: v_writelane_b32 v18, s55, 15
-; SI-NEXT: v_writelane_b32 v18, s64, 16
-; SI-NEXT: v_writelane_b32 v18, s65, 17
-; SI-NEXT: v_writelane_b32 v18, s66, 18
-; SI-NEXT: v_writelane_b32 v18, s67, 19
-; SI-NEXT: v_writelane_b32 v18, s68, 20
-; SI-NEXT: v_writelane_b32 v18, s69, 21
-; SI-NEXT: v_writelane_b32 v18, s70, 22
-; SI-NEXT: v_writelane_b32 v18, s71, 23
-; SI-NEXT: v_writelane_b32 v18, s80, 24
-; SI-NEXT: v_writelane_b32 v18, s81, 25
-; SI-NEXT: v_writelane_b32 v18, s82, 26
-; SI-NEXT: v_writelane_b32 v18, s83, 27
-; SI-NEXT: v_writelane_b32 v18, s84, 28
-; SI-NEXT: v_writelane_b32 v18, s85, 29
-; SI-NEXT: v_writelane_b32 v18, s86, 30
-; SI-NEXT: v_writelane_b32 v18, s87, 31
-; SI-NEXT: v_writelane_b32 v18, s96, 32
-; SI-NEXT: v_writelane_b32 v18, s97, 33
-; SI-NEXT: v_writelane_b32 v18, s98, 34
+; SI-NEXT: v_writelane_b32 v18, s34, 0
+; SI-NEXT: v_writelane_b32 v18, s35, 1
+; SI-NEXT: v_writelane_b32 v18, s36, 2
+; SI-NEXT: v_writelane_b32 v18, s37, 3
+; SI-NEXT: v_writelane_b32 v18, s38, 4
+; SI-NEXT: v_writelane_b32 v18, s39, 5
+; SI-NEXT: v_writelane_b32 v18, s48, 6
+; SI-NEXT: v_writelane_b32 v18, s49, 7
+; SI-NEXT: v_writelane_b32 v18, s50, 8
+; SI-NEXT: v_writelane_b32 v18, s51, 9
+; SI-NEXT: v_writelane_b32 v18, s52, 10
+; SI-NEXT: v_writelane_b32 v18, s53, 11
+; SI-NEXT: v_writelane_b32 v18, s54, 12
+; SI-NEXT: v_writelane_b32 v18, s55, 13
+; SI-NEXT: v_writelane_b32 v18, s64, 14
+; SI-NEXT: v_writelane_b32 v18, s65, 15
+; SI-NEXT: v_writelane_b32 v18, s66, 16
+; SI-NEXT: v_writelane_b32 v18, s67, 17
+; SI-NEXT: v_writelane_b32 v18, s68, 18
+; SI-NEXT: v_writelane_b32 v18, s69, 19
+; SI-NEXT: v_writelane_b32 v18, s70, 20
+; SI-NEXT: v_writelane_b32 v18, s71, 21
+; SI-NEXT: v_writelane_b32 v18, s80, 22
+; SI-NEXT: v_writelane_b32 v18, s81, 23
+; SI-NEXT: v_writelane_b32 v18, s82, 24
+; SI-NEXT: v_writelane_b32 v18, s83, 25
+; SI-NEXT: v_writelane_b32 v18, s84, 26
+; SI-NEXT: v_writelane_b32 v18, s85, 27
+; SI-NEXT: v_writelane_b32 v18, s86, 28
+; SI-NEXT: v_writelane_b32 v18, s87, 29
+; SI-NEXT: v_writelane_b32 v18, s96, 30
+; SI-NEXT: v_writelane_b32 v18, s97, 31
+; SI-NEXT: v_writelane_b32 v18, s98, 32
+; SI-NEXT: v_writelane_b32 v18, s99, 33
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_lshr_b32 s96, s29, 16
@@ -77024,12 +77023,13 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; SI-NEXT: s_lshr_b32 s68, s17, 16
; SI-NEXT: s_lshr_b32 s69, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_writelane_b32 v18, s99, 35
+; SI-NEXT: v_writelane_b32 v18, s30, 34
; SI-NEXT: v_readfirstlane_b32 s98, v2
; SI-NEXT: v_readfirstlane_b32 s44, v1
; SI-NEXT: v_readfirstlane_b32 s99, v4
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s46, v5
+; SI-NEXT: v_writelane_b32 v18, s31, 35
; SI-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB105_3
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -77613,43 +77613,43 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT: v_readlane_b32 s30, v18, 34
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v18, 35
-; SI-NEXT: v_readlane_b32 s98, v18, 34
-; SI-NEXT: v_readlane_b32 s97, v18, 33
-; SI-NEXT: v_readlane_b32 s96, v18, 32
-; SI-NEXT: v_readlane_b32 s87, v18, 31
-; SI-NEXT: v_readlane_b32 s86, v18, 30
-; SI-NEXT: v_readlane_b32 s85, v18, 29
-; SI-NEXT: v_readlane_b32 s84, v18, 28
-; SI-NEXT: v_readlane_b32 s83, v18, 27
-; SI-NEXT: v_readlane_b32 s82, v18, 26
-; SI-NEXT: v_readlane_b32 s81, v18, 25
-; SI-NEXT: v_readlane_b32 s80, v18, 24
-; SI-NEXT: v_readlane_b32 s71, v18, 23
-; SI-NEXT: v_readlane_b32 s70, v18, 22
-; SI-NEXT: v_readlane_b32 s69, v18, 21
-; SI-NEXT: v_readlane_b32 s68, v18, 20
-; SI-NEXT: v_readlane_b32 s67, v18, 19
-; SI-NEXT: v_readlane_b32 s66, v18, 18
-; SI-NEXT: v_readlane_b32 s65, v18, 17
-; SI-NEXT: v_readlane_b32 s64, v18, 16
-; SI-NEXT: v_readlane_b32 s55, v18, 15
-; SI-NEXT: v_readlane_b32 s54, v18, 14
-; SI-NEXT: v_readlane_b32 s53, v18, 13
-; SI-NEXT: v_readlane_b32 s52, v18, 12
-; SI-NEXT: v_readlane_b32 s51, v18, 11
-; SI-NEXT: v_readlane_b32 s50, v18, 10
-; SI-NEXT: v_readlane_b32 s49, v18, 9
-; SI-NEXT: v_readlane_b32 s48, v18, 8
-; SI-NEXT: v_readlane_b32 s39, v18, 7
-; SI-NEXT: v_readlane_b32 s38, v18, 6
-; SI-NEXT: v_readlane_b32 s37, v18, 5
-; SI-NEXT: v_readlane_b32 s36, v18, 4
-; SI-NEXT: v_readlane_b32 s35, v18, 3
-; SI-NEXT: v_readlane_b32 s34, v18, 2
-; SI-NEXT: v_readlane_b32 s31, v18, 1
-; SI-NEXT: v_readlane_b32 s30, v18, 0
+; SI-NEXT: v_readlane_b32 s31, v18, 35
+; SI-NEXT: v_readlane_b32 s99, v18, 33
+; SI-NEXT: v_readlane_b32 s98, v18, 32
+; SI-NEXT: v_readlane_b32 s97, v18, 31
+; SI-NEXT: v_readlane_b32 s96, v18, 30
+; SI-NEXT: v_readlane_b32 s87, v18, 29
+; SI-NEXT: v_readlane_b32 s86, v18, 28
+; SI-NEXT: v_readlane_b32 s85, v18, 27
+; SI-NEXT: v_readlane_b32 s84, v18, 26
+; SI-NEXT: v_readlane_b32 s83, v18, 25
+; SI-NEXT: v_readlane_b32 s82, v18, 24
+; SI-NEXT: v_readlane_b32 s81, v18, 23
+; SI-NEXT: v_readlane_b32 s80, v18, 22
+; SI-NEXT: v_readlane_b32 s71, v18, 21
+; SI-NEXT: v_readlane_b32 s70, v18, 20
+; SI-NEXT: v_readlane_b32 s69, v18, 19
+; SI-NEXT: v_readlane_b32 s68, v18, 18
+; SI-NEXT: v_readlane_b32 s67, v18, 17
+; SI-NEXT: v_readlane_b32 s66, v18, 16
+; SI-NEXT: v_readlane_b32 s65, v18, 15
+; SI-NEXT: v_readlane_b32 s64, v18, 14
+; SI-NEXT: v_readlane_b32 s55, v18, 13
+; SI-NEXT: v_readlane_b32 s54, v18, 12
+; SI-NEXT: v_readlane_b32 s53, v18, 11
+; SI-NEXT: v_readlane_b32 s52, v18, 10
+; SI-NEXT: v_readlane_b32 s51, v18, 9
+; SI-NEXT: v_readlane_b32 s50, v18, 8
+; SI-NEXT: v_readlane_b32 s49, v18, 7
+; SI-NEXT: v_readlane_b32 s48, v18, 6
+; SI-NEXT: v_readlane_b32 s39, v18, 5
+; SI-NEXT: v_readlane_b32 s38, v18, 4
+; SI-NEXT: v_readlane_b32 s37, v18, 3
+; SI-NEXT: v_readlane_b32 s36, v18, 2
+; SI-NEXT: v_readlane_b32 s35, v18, 1
+; SI-NEXT: v_readlane_b32 s34, v18, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -77663,25 +77663,25 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v63, s30, 0
-; VI-NEXT: v_writelane_b32 v63, s31, 1
-; VI-NEXT: v_writelane_b32 v63, s34, 2
-; VI-NEXT: v_writelane_b32 v63, s35, 3
-; VI-NEXT: v_writelane_b32 v63, s36, 4
-; VI-NEXT: v_writelane_b32 v63, s37, 5
-; VI-NEXT: v_writelane_b32 v63, s38, 6
-; VI-NEXT: v_writelane_b32 v63, s39, 7
-; VI-NEXT: v_writelane_b32 v63, s48, 8
-; VI-NEXT: v_writelane_b32 v63, s49, 9
-; VI-NEXT: v_writelane_b32 v63, s50, 10
-; VI-NEXT: v_writelane_b32 v63, s51, 11
-; VI-NEXT: v_writelane_b32 v63, s52, 12
-; VI-NEXT: v_writelane_b32 v63, s53, 13
-; VI-NEXT: v_writelane_b32 v63, s54, 14
-; VI-NEXT: v_writelane_b32 v63, s55, 15
-; VI-NEXT: v_writelane_b32 v63, s64, 16
-; VI-NEXT: v_writelane_b32 v63, s65, 17
-; VI-NEXT: v_writelane_b32 v63, s66, 18
+; VI-NEXT: v_writelane_b32 v63, s34, 0
+; VI-NEXT: v_writelane_b32 v63, s35, 1
+; VI-NEXT: v_writelane_b32 v63, s36, 2
+; VI-NEXT: v_writelane_b32 v63, s37, 3
+; VI-NEXT: v_writelane_b32 v63, s38, 4
+; VI-NEXT: v_writelane_b32 v63, s39, 5
+; VI-NEXT: v_writelane_b32 v63, s48, 6
+; VI-NEXT: v_writelane_b32 v63, s49, 7
+; VI-NEXT: v_writelane_b32 v63, s50, 8
+; VI-NEXT: v_writelane_b32 v63, s51, 9
+; VI-NEXT: v_writelane_b32 v63, s52, 10
+; VI-NEXT: v_writelane_b32 v63, s53, 11
+; VI-NEXT: v_writelane_b32 v63, s54, 12
+; VI-NEXT: v_writelane_b32 v63, s55, 13
+; VI-NEXT: v_writelane_b32 v63, s64, 14
+; VI-NEXT: v_writelane_b32 v63, s65, 15
+; VI-NEXT: v_writelane_b32 v63, s66, 16
+; VI-NEXT: v_writelane_b32 v63, s67, 17
+; VI-NEXT: v_writelane_b32 v63, s30, 18
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
@@ -77697,7 +77697,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v16, s28
; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v63, s67, 19
+; VI-NEXT: v_writelane_b32 v63, s31, 19
; VI-NEXT: v_readfirstlane_b32 s18, v4
; VI-NEXT: v_readfirstlane_b32 s19, v5
; VI-NEXT: v_readfirstlane_b32 s16, v6
@@ -78106,26 +78106,26 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: v_readlane_b32 s67, v63, 19
-; VI-NEXT: v_readlane_b32 s66, v63, 18
-; VI-NEXT: v_readlane_b32 s65, v63, 17
-; VI-NEXT: v_readlane_b32 s64, v63, 16
-; VI-NEXT: v_readlane_b32 s55, v63, 15
-; VI-NEXT: v_readlane_b32 s54, v63, 14
-; VI-NEXT: v_readlane_b32 s53, v63, 13
-; VI-NEXT: v_readlane_b32 s52, v63, 12
-; VI-NEXT: v_readlane_b32 s51, v63, 11
-; VI-NEXT: v_readlane_b32 s50, v63, 10
-; VI-NEXT: v_readlane_b32 s49, v63, 9
-; VI-NEXT: v_readlane_b32 s48, v63, 8
-; VI-NEXT: v_readlane_b32 s39, v63, 7
-; VI-NEXT: v_readlane_b32 s38, v63, 6
-; VI-NEXT: v_readlane_b32 s37, v63, 5
-; VI-NEXT: v_readlane_b32 s36, v63, 4
-; VI-NEXT: v_readlane_b32 s35, v63, 3
-; VI-NEXT: v_readlane_b32 s34, v63, 2
-; VI-NEXT: v_readlane_b32 s31, v63, 1
-; VI-NEXT: v_readlane_b32 s30, v63, 0
+; VI-NEXT: v_readlane_b32 s30, v63, 18
+; VI-NEXT: v_readlane_b32 s31, v63, 19
+; VI-NEXT: v_readlane_b32 s67, v63, 17
+; VI-NEXT: v_readlane_b32 s66, v63, 16
+; VI-NEXT: v_readlane_b32 s65, v63, 15
+; VI-NEXT: v_readlane_b32 s64, v63, 14
+; VI-NEXT: v_readlane_b32 s55, v63, 13
+; VI-NEXT: v_readlane_b32 s54, v63, 12
+; VI-NEXT: v_readlane_b32 s53, v63, 11
+; VI-NEXT: v_readlane_b32 s52, v63, 10
+; VI-NEXT: v_readlane_b32 s51, v63, 9
+; VI-NEXT: v_readlane_b32 s50, v63, 8
+; VI-NEXT: v_readlane_b32 s49, v63, 7
+; VI-NEXT: v_readlane_b32 s48, v63, 6
+; VI-NEXT: v_readlane_b32 s39, v63, 5
+; VI-NEXT: v_readlane_b32 s38, v63, 4
+; VI-NEXT: v_readlane_b32 s37, v63, 3
+; VI-NEXT: v_readlane_b32 s36, v63, 2
+; VI-NEXT: v_readlane_b32 s35, v63, 1
+; VI-NEXT: v_readlane_b32 s34, v63, 0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_perm_b32 v2, v29, v2, s4
; VI-NEXT: v_perm_b32 v1, v50, v1, s4
@@ -78474,18 +78474,18 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-NEXT: s_cmp_lg_u32 s28, 0
; GFX11-NEXT: s_mov_b32 s42, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-NEXT: v_writelane_b32 v40, s35, 1
+; GFX11-NEXT: v_writelane_b32 v40, s36, 2
+; GFX11-NEXT: v_writelane_b32 v40, s37, 3
+; GFX11-NEXT: v_writelane_b32 v40, s38, 4
+; GFX11-NEXT: v_writelane_b32 v40, s39, 5
+; GFX11-NEXT: v_writelane_b32 v40, s48, 6
+; GFX11-NEXT: v_writelane_b32 v40, s49, 7
+; GFX11-NEXT: v_writelane_b32 v40, s30, 8
+; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_cbranch_scc0 .LBB105_3
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_lshr_b32 s43, s27, 24
@@ -78753,21 +78753,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; GFX11-NEXT: v_or_b32_e32 v2, v4, v17
; GFX11-NEXT: v_or_b32_e32 v3, v19, v15
; GFX11-NEXT: v_or_b32_e32 v4, v16, v18
+; GFX11-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:32
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
+; GFX11-NEXT: v_readlane_b32 s49, v40, 7
+; GFX11-NEXT: v_readlane_b32 s48, v40, 6
+; GFX11-NEXT: v_readlane_b32 s39, v40, 5
+; GFX11-NEXT: v_readlane_b32 s38, v40, 4
+; GFX11-NEXT: v_readlane_b32 s37, v40, 3
+; GFX11-NEXT: v_readlane_b32 s36, v40, 2
+; GFX11-NEXT: v_readlane_b32 s35, v40, 1
+; GFX11-NEXT: v_readlane_b32 s34, v40, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
@@ -78790,7 +78790,7 @@ end:
ret <64 x i8> %phi
}
-define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
+define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -81269,7 +81269,7 @@ end:
ret <32 x half> %phi
}
-define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v32f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -81286,49 +81286,49 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v40, s30, 0
-; SI-NEXT: v_writelane_b32 v40, s31, 1
-; SI-NEXT: v_writelane_b32 v40, s34, 2
-; SI-NEXT: v_writelane_b32 v40, s35, 3
-; SI-NEXT: v_writelane_b32 v40, s36, 4
-; SI-NEXT: v_writelane_b32 v40, s37, 5
-; SI-NEXT: v_writelane_b32 v40, s38, 6
-; SI-NEXT: v_writelane_b32 v40, s39, 7
-; SI-NEXT: v_writelane_b32 v40, s48, 8
-; SI-NEXT: v_writelane_b32 v40, s49, 9
-; SI-NEXT: v_writelane_b32 v40, s50, 10
-; SI-NEXT: v_writelane_b32 v40, s51, 11
-; SI-NEXT: v_writelane_b32 v40, s52, 12
-; SI-NEXT: v_writelane_b32 v40, s53, 13
-; SI-NEXT: v_writelane_b32 v40, s54, 14
-; SI-NEXT: v_writelane_b32 v40, s55, 15
-; SI-NEXT: v_writelane_b32 v40, s64, 16
-; SI-NEXT: v_writelane_b32 v40, s65, 17
-; SI-NEXT: v_writelane_b32 v40, s66, 18
-; SI-NEXT: v_writelane_b32 v40, s67, 19
-; SI-NEXT: v_writelane_b32 v40, s68, 20
-; SI-NEXT: v_writelane_b32 v40, s69, 21
-; SI-NEXT: v_writelane_b32 v40, s70, 22
-; SI-NEXT: v_writelane_b32 v40, s71, 23
-; SI-NEXT: v_writelane_b32 v40, s80, 24
-; SI-NEXT: v_writelane_b32 v40, s81, 25
-; SI-NEXT: v_writelane_b32 v40, s82, 26
-; SI-NEXT: v_writelane_b32 v40, s83, 27
-; SI-NEXT: v_writelane_b32 v40, s84, 28
-; SI-NEXT: v_writelane_b32 v40, s85, 29
-; SI-NEXT: v_writelane_b32 v40, s86, 30
-; SI-NEXT: v_writelane_b32 v40, s87, 31
-; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
+; SI-NEXT: v_writelane_b32 v40, s96, 30
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v40, s97, 33
+; SI-NEXT: v_writelane_b32 v40, s97, 31
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s28, 0
; SI-NEXT: v_writelane_b32 v41, s26, 1
+; SI-NEXT: v_writelane_b32 v40, s98, 32
; SI-NEXT: v_writelane_b32 v41, s23, 2
+; SI-NEXT: v_writelane_b32 v40, s99, 33
; SI-NEXT: v_writelane_b32 v41, s22, 3
-; SI-NEXT: v_writelane_b32 v40, s98, 34
+; SI-NEXT: v_writelane_b32 v40, s30, 34
; SI-NEXT: v_writelane_b32 v41, s21, 4
-; SI-NEXT: v_writelane_b32 v40, s99, 35
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: s_mov_b32 s88, s29
; SI-NEXT: s_mov_b32 s30, s25
; SI-NEXT: s_mov_b32 s29, s24
@@ -81986,6 +81986,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_lshl_b32 s20, s45, 16
; SI-NEXT: s_or_b32 s5, s5, s20
+; SI-NEXT: v_readlane_b32 s30, v40, 34
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
@@ -82002,42 +82003,41 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v13, s7
; SI-NEXT: v_mov_b32_e32 v14, s4
; SI-NEXT: v_mov_b32_e32 v15, s5
-; SI-NEXT: v_readlane_b32 s99, v40, 35
-; SI-NEXT: v_readlane_b32 s98, v40, 34
-; SI-NEXT: v_readlane_b32 s97, v40, 33
-; SI-NEXT: v_readlane_b32 s96, v40, 32
-; SI-NEXT: v_readlane_b32 s87, v40, 31
-; SI-NEXT: v_readlane_b32 s86, v40, 30
-; SI-NEXT: v_readlane_b32 s85, v40, 29
-; SI-NEXT: v_readlane_b32 s84, v40, 28
-; SI-NEXT: v_readlane_b32 s83, v40, 27
-; SI-NEXT: v_readlane_b32 s82, v40, 26
-; SI-NEXT: v_readlane_b32 s81, v40, 25
-; SI-NEXT: v_readlane_b32 s80, v40, 24
-; SI-NEXT: v_readlane_b32 s71, v40, 23
-; SI-NEXT: v_readlane_b32 s70, v40, 22
-; SI-NEXT: v_readlane_b32 s69, v40, 21
-; SI-NEXT: v_readlane_b32 s68, v40, 20
-; SI-NEXT: v_readlane_b32 s67, v40, 19
-; SI-NEXT: v_readlane_b32 s66, v40, 18
-; SI-NEXT: v_readlane_b32 s65, v40, 17
-; SI-NEXT: v_readlane_b32 s64, v40, 16
-; SI-NEXT: v_readlane_b32 s55, v40, 15
-; SI-NEXT: v_readlane_b32 s54, v40, 14
-; SI-NEXT: v_readlane_b32 s53, v40, 13
-; SI-NEXT: v_readlane_b32 s52, v40, 12
-; SI-NEXT: v_readlane_b32 s51, v40, 11
-; SI-NEXT: v_readlane_b32 s50, v40, 10
-; SI-NEXT: v_readlane_b32 s49, v40, 9
-; SI-NEXT: v_readlane_b32 s48, v40, 8
-; SI-NEXT: v_readlane_b32 s39, v40, 7
-; SI-NEXT: v_readlane_b32 s38, v40, 6
-; SI-NEXT: v_readlane_b32 s37, v40, 5
-; SI-NEXT: v_readlane_b32 s36, v40, 4
-; SI-NEXT: v_readlane_b32 s35, v40, 3
-; SI-NEXT: v_readlane_b32 s34, v40, 2
-; SI-NEXT: v_readlane_b32 s31, v40, 1
-; SI-NEXT: v_readlane_b32 s30, v40, 0
+; SI-NEXT: v_readlane_b32 s31, v40, 35
+; SI-NEXT: v_readlane_b32 s99, v40, 33
+; SI-NEXT: v_readlane_b32 s98, v40, 32
+; SI-NEXT: v_readlane_b32 s97, v40, 31
+; SI-NEXT: v_readlane_b32 s96, v40, 30
+; SI-NEXT: v_readlane_b32 s87, v40, 29
+; SI-NEXT: v_readlane_b32 s86, v40, 28
+; SI-NEXT: v_readlane_b32 s85, v40, 27
+; SI-NEXT: v_readlane_b32 s84, v40, 26
+; SI-NEXT: v_readlane_b32 s83, v40, 25
+; SI-NEXT: v_readlane_b32 s82, v40, 24
+; SI-NEXT: v_readlane_b32 s81, v40, 23
+; SI-NEXT: v_readlane_b32 s80, v40, 22
+; SI-NEXT: v_readlane_b32 s71, v40, 21
+; SI-NEXT: v_readlane_b32 s70, v40, 20
+; SI-NEXT: v_readlane_b32 s69, v40, 19
+; SI-NEXT: v_readlane_b32 s68, v40, 18
+; SI-NEXT: v_readlane_b32 s67, v40, 17
+; SI-NEXT: v_readlane_b32 s66, v40, 16
+; SI-NEXT: v_readlane_b32 s65, v40, 15
+; SI-NEXT: v_readlane_b32 s64, v40, 14
+; SI-NEXT: v_readlane_b32 s55, v40, 13
+; SI-NEXT: v_readlane_b32 s54, v40, 12
+; SI-NEXT: v_readlane_b32 s53, v40, 11
+; SI-NEXT: v_readlane_b32 s52, v40, 10
+; SI-NEXT: v_readlane_b32 s51, v40, 9
+; SI-NEXT: v_readlane_b32 s50, v40, 8
+; SI-NEXT: v_readlane_b32 s49, v40, 7
+; SI-NEXT: v_readlane_b32 s48, v40, 6
+; SI-NEXT: v_readlane_b32 s39, v40, 5
+; SI-NEXT: v_readlane_b32 s38, v40, 4
+; SI-NEXT: v_readlane_b32 s37, v40, 3
+; SI-NEXT: v_readlane_b32 s36, v40, 2
+; SI-NEXT: v_readlane_b32 s35, v40, 1
+; SI-NEXT: v_readlane_b32 s34, v40, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -83316,7 +83316,7 @@ end:
ret <32 x half> %phi
}
-define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
+define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -86292,7 +86292,7 @@ end:
ret <64 x i8> %phi
}
-define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -86301,40 +86301,40 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v40, s30, 0
-; SI-NEXT: v_writelane_b32 v40, s31, 1
-; SI-NEXT: v_writelane_b32 v40, s34, 2
-; SI-NEXT: v_writelane_b32 v40, s35, 3
-; SI-NEXT: v_writelane_b32 v40, s36, 4
-; SI-NEXT: v_writelane_b32 v40, s37, 5
-; SI-NEXT: v_writelane_b32 v40, s38, 6
-; SI-NEXT: v_writelane_b32 v40, s39, 7
-; SI-NEXT: v_writelane_b32 v40, s48, 8
-; SI-NEXT: v_writelane_b32 v40, s49, 9
-; SI-NEXT: v_writelane_b32 v40, s50, 10
-; SI-NEXT: v_writelane_b32 v40, s51, 11
-; SI-NEXT: v_writelane_b32 v40, s52, 12
-; SI-NEXT: v_writelane_b32 v40, s53, 13
-; SI-NEXT: v_writelane_b32 v40, s54, 14
-; SI-NEXT: v_writelane_b32 v40, s55, 15
-; SI-NEXT: v_writelane_b32 v40, s64, 16
-; SI-NEXT: v_writelane_b32 v40, s65, 17
-; SI-NEXT: v_writelane_b32 v40, s66, 18
-; SI-NEXT: v_writelane_b32 v40, s67, 19
-; SI-NEXT: v_writelane_b32 v40, s68, 20
-; SI-NEXT: v_writelane_b32 v40, s69, 21
-; SI-NEXT: v_writelane_b32 v40, s70, 22
-; SI-NEXT: v_writelane_b32 v40, s71, 23
-; SI-NEXT: v_writelane_b32 v40, s80, 24
-; SI-NEXT: v_writelane_b32 v40, s81, 25
-; SI-NEXT: v_writelane_b32 v40, s82, 26
-; SI-NEXT: v_writelane_b32 v40, s83, 27
-; SI-NEXT: v_writelane_b32 v40, s84, 28
-; SI-NEXT: v_writelane_b32 v40, s85, 29
-; SI-NEXT: v_writelane_b32 v40, s86, 30
-; SI-NEXT: v_writelane_b32 v40, s87, 31
-; SI-NEXT: v_writelane_b32 v40, s96, 32
-; SI-NEXT: v_writelane_b32 v40, s97, 33
+; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
+; SI-NEXT: v_writelane_b32 v40, s96, 30
+; SI-NEXT: v_writelane_b32 v40, s97, 31
+; SI-NEXT: v_writelane_b32 v40, s98, 32
+; SI-NEXT: v_writelane_b32 v40, s99, 33
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
@@ -86368,7 +86368,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: s_and_b32 s43, s16, 0xffff0000
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_writelane_b32 v40, s98, 34
+; SI-NEXT: v_writelane_b32 v40, s30, 34
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s43
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
@@ -86402,7 +86402,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_mul_f32_e64 v32, 1.0, s9
; SI-NEXT: v_mul_f32_e64 v27, 1.0, s6
; SI-NEXT: v_mul_f32_e64 v29, 1.0, s7
-; SI-NEXT: v_writelane_b32 v40, s99, 35
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
@@ -86899,44 +86899,44 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT: v_readlane_b32 s30, v40, 34
; SI-NEXT: v_readlane_b32 s75, v41, 1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v40, 35
-; SI-NEXT: v_readlane_b32 s98, v40, 34
-; SI-NEXT: v_readlane_b32 s97, v40, 33
-; SI-NEXT: v_readlane_b32 s96, v40, 32
-; SI-NEXT: v_readlane_b32 s87, v40, 31
-; SI-NEXT: v_readlane_b32 s86, v40, 30
-; SI-NEXT: v_readlane_b32 s85, v40, 29
-; SI-NEXT: v_readlane_b32 s84, v40, 28
-; SI-NEXT: v_readlane_b32 s83, v40, 27
-; SI-NEXT: v_readlane_b32 s82, v40, 26
-; SI-NEXT: v_readlane_b32 s81, v40, 25
-; SI-NEXT: v_readlane_b32 s80, v40, 24
-; SI-NEXT: v_readlane_b32 s71, v40, 23
-; SI-NEXT: v_readlane_b32 s70, v40, 22
-; SI-NEXT: v_readlane_b32 s69, v40, 21
-; SI-NEXT: v_readlane_b32 s68, v40, 20
-; SI-NEXT: v_readlane_b32 s67, v40, 19
-; SI-NEXT: v_readlane_b32 s66, v40, 18
-; SI-NEXT: v_readlane_b32 s65, v40, 17
-; SI-NEXT: v_readlane_b32 s64, v40, 16
-; SI-NEXT: v_readlane_b32 s55, v40, 15
-; SI-NEXT: v_readlane_b32 s54, v40, 14
-; SI-NEXT: v_readlane_b32 s53, v40, 13
-; SI-NEXT: v_readlane_b32 s52, v40, 12
-; SI-NEXT: v_readlane_b32 s51, v40, 11
-; SI-NEXT: v_readlane_b32 s50, v40, 10
-; SI-NEXT: v_readlane_b32 s49, v40, 9
-; SI-NEXT: v_readlane_b32 s48, v40, 8
-; SI-NEXT: v_readlane_b32 s39, v40, 7
-; SI-NEXT: v_readlane_b32 s38, v40, 6
-; SI-NEXT: v_readlane_b32 s37, v40, 5
-; SI-NEXT: v_readlane_b32 s36, v40, 4
-; SI-NEXT: v_readlane_b32 s35, v40, 3
-; SI-NEXT: v_readlane_b32 s34, v40, 2
-; SI-NEXT: v_readlane_b32 s31, v40, 1
-; SI-NEXT: v_readlane_b32 s30, v40, 0
+; SI-NEXT: v_readlane_b32 s31, v40, 35
+; SI-NEXT: v_readlane_b32 s99, v40, 33
+; SI-NEXT: v_readlane_b32 s98, v40, 32
+; SI-NEXT: v_readlane_b32 s97, v40, 31
+; SI-NEXT: v_readlane_b32 s96, v40, 30
+; SI-NEXT: v_readlane_b32 s87, v40, 29
+; SI-NEXT: v_readlane_b32 s86, v40, 28
+; SI-NEXT: v_readlane_b32 s85, v40, 27
+; SI-NEXT: v_readlane_b32 s84, v40, 26
+; SI-NEXT: v_readlane_b32 s83, v40, 25
+; SI-NEXT: v_readlane_b32 s82, v40, 24
+; SI-NEXT: v_readlane_b32 s81, v40, 23
+; SI-NEXT: v_readlane_b32 s80, v40, 22
+; SI-NEXT: v_readlane_b32 s71, v40, 21
+; SI-NEXT: v_readlane_b32 s70, v40, 20
+; SI-NEXT: v_readlane_b32 s69, v40, 19
+; SI-NEXT: v_readlane_b32 s68, v40, 18
+; SI-NEXT: v_readlane_b32 s67, v40, 17
+; SI-NEXT: v_readlane_b32 s66, v40, 16
+; SI-NEXT: v_readlane_b32 s65, v40, 15
+; SI-NEXT: v_readlane_b32 s64, v40, 14
+; SI-NEXT: v_readlane_b32 s55, v40, 13
+; SI-NEXT: v_readlane_b32 s54, v40, 12
+; SI-NEXT: v_readlane_b32 s53, v40, 11
+; SI-NEXT: v_readlane_b32 s52, v40, 10
+; SI-NEXT: v_readlane_b32 s51, v40, 9
+; SI-NEXT: v_readlane_b32 s50, v40, 8
+; SI-NEXT: v_readlane_b32 s49, v40, 7
+; SI-NEXT: v_readlane_b32 s48, v40, 6
+; SI-NEXT: v_readlane_b32 s39, v40, 5
+; SI-NEXT: v_readlane_b32 s38, v40, 4
+; SI-NEXT: v_readlane_b32 s37, v40, 3
+; SI-NEXT: v_readlane_b32 s36, v40, 2
+; SI-NEXT: v_readlane_b32 s35, v40, 1
+; SI-NEXT: v_readlane_b32 s34, v40, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -87019,25 +87019,25 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v63, s30, 0
-; VI-NEXT: v_writelane_b32 v63, s31, 1
-; VI-NEXT: v_writelane_b32 v63, s34, 2
-; VI-NEXT: v_writelane_b32 v63, s35, 3
-; VI-NEXT: v_writelane_b32 v63, s36, 4
-; VI-NEXT: v_writelane_b32 v63, s37, 5
-; VI-NEXT: v_writelane_b32 v63, s38, 6
-; VI-NEXT: v_writelane_b32 v63, s39, 7
-; VI-NEXT: v_writelane_b32 v63, s48, 8
-; VI-NEXT: v_writelane_b32 v63, s49, 9
-; VI-NEXT: v_writelane_b32 v63, s50, 10
-; VI-NEXT: v_writelane_b32 v63, s51, 11
-; VI-NEXT: v_writelane_b32 v63, s52, 12
-; VI-NEXT: v_writelane_b32 v63, s53, 13
-; VI-NEXT: v_writelane_b32 v63, s54, 14
-; VI-NEXT: v_writelane_b32 v63, s55, 15
-; VI-NEXT: v_writelane_b32 v63, s64, 16
-; VI-NEXT: v_writelane_b32 v63, s65, 17
-; VI-NEXT: v_writelane_b32 v63, s66, 18
+; VI-NEXT: v_writelane_b32 v63, s34, 0
+; VI-NEXT: v_writelane_b32 v63, s35, 1
+; VI-NEXT: v_writelane_b32 v63, s36, 2
+; VI-NEXT: v_writelane_b32 v63, s37, 3
+; VI-NEXT: v_writelane_b32 v63, s38, 4
+; VI-NEXT: v_writelane_b32 v63, s39, 5
+; VI-NEXT: v_writelane_b32 v63, s48, 6
+; VI-NEXT: v_writelane_b32 v63, s49, 7
+; VI-NEXT: v_writelane_b32 v63, s50, 8
+; VI-NEXT: v_writelane_b32 v63, s51, 9
+; VI-NEXT: v_writelane_b32 v63, s52, 10
+; VI-NEXT: v_writelane_b32 v63, s53, 11
+; VI-NEXT: v_writelane_b32 v63, s54, 12
+; VI-NEXT: v_writelane_b32 v63, s55, 13
+; VI-NEXT: v_writelane_b32 v63, s64, 14
+; VI-NEXT: v_writelane_b32 v63, s65, 15
+; VI-NEXT: v_writelane_b32 v63, s66, 16
+; VI-NEXT: v_writelane_b32 v63, s67, 17
+; VI-NEXT: v_writelane_b32 v63, s30, 18
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
@@ -87053,7 +87053,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_mov_b32_e32 v16, s28
; VI-NEXT: v_mov_b32_e32 v17, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v63, s67, 19
+; VI-NEXT: v_writelane_b32 v63, s31, 19
; VI-NEXT: v_readfirstlane_b32 s18, v4
; VI-NEXT: v_readfirstlane_b32 s19, v5
; VI-NEXT: v_readfirstlane_b32 s16, v6
@@ -87698,26 +87698,26 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: v_perm_b32 v1, v22, v9, s4
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT: v_readlane_b32 s67, v63, 19
-; VI-NEXT: v_readlane_b32 s66, v63, 18
-; VI-NEXT: v_readlane_b32 s65, v63, 17
-; VI-NEXT: v_readlane_b32 s64, v63, 16
-; VI-NEXT: v_readlane_b32 s55, v63, 15
-; VI-NEXT: v_readlane_b32 s54, v63, 14
-; VI-NEXT: v_readlane_b32 s53, v63, 13
-; VI-NEXT: v_readlane_b32 s52, v63, 12
-; VI-NEXT: v_readlane_b32 s51, v63, 11
-; VI-NEXT: v_readlane_b32 s50, v63, 10
-; VI-NEXT: v_readlane_b32 s49, v63, 9
-; VI-NEXT: v_readlane_b32 s48, v63, 8
-; VI-NEXT: v_readlane_b32 s39, v63, 7
-; VI-NEXT: v_readlane_b32 s38, v63, 6
-; VI-NEXT: v_readlane_b32 s37, v63, 5
-; VI-NEXT: v_readlane_b32 s36, v63, 4
-; VI-NEXT: v_readlane_b32 s35, v63, 3
-; VI-NEXT: v_readlane_b32 s34, v63, 2
-; VI-NEXT: v_readlane_b32 s31, v63, 1
-; VI-NEXT: v_readlane_b32 s30, v63, 0
+; VI-NEXT: v_readlane_b32 s30, v63, 18
+; VI-NEXT: v_readlane_b32 s31, v63, 19
+; VI-NEXT: v_readlane_b32 s67, v63, 17
+; VI-NEXT: v_readlane_b32 s66, v63, 16
+; VI-NEXT: v_readlane_b32 s65, v63, 15
+; VI-NEXT: v_readlane_b32 s64, v63, 14
+; VI-NEXT: v_readlane_b32 s55, v63, 13
+; VI-NEXT: v_readlane_b32 s54, v63, 12
+; VI-NEXT: v_readlane_b32 s53, v63, 11
+; VI-NEXT: v_readlane_b32 s52, v63, 10
+; VI-NEXT: v_readlane_b32 s51, v63, 9
+; VI-NEXT: v_readlane_b32 s50, v63, 8
+; VI-NEXT: v_readlane_b32 s49, v63, 7
+; VI-NEXT: v_readlane_b32 s48, v63, 6
+; VI-NEXT: v_readlane_b32 s39, v63, 5
+; VI-NEXT: v_readlane_b32 s38, v63, 4
+; VI-NEXT: v_readlane_b32 s37, v63, 3
+; VI-NEXT: v_readlane_b32 s36, v63, 2
+; VI-NEXT: v_readlane_b32 s35, v63, 1
+; VI-NEXT: v_readlane_b32 s34, v63, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v2, v3, v2, s4
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -87750,21 +87750,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v63, s30, 0
-; GFX9-NEXT: v_writelane_b32 v63, s31, 1
-; GFX9-NEXT: v_writelane_b32 v63, s34, 2
-; GFX9-NEXT: v_writelane_b32 v63, s35, 3
-; GFX9-NEXT: v_writelane_b32 v63, s36, 4
-; GFX9-NEXT: v_writelane_b32 v63, s37, 5
-; GFX9-NEXT: v_writelane_b32 v63, s38, 6
-; GFX9-NEXT: v_writelane_b32 v63, s39, 7
-; GFX9-NEXT: v_writelane_b32 v63, s48, 8
-; GFX9-NEXT: v_writelane_b32 v63, s49, 9
-; GFX9-NEXT: v_writelane_b32 v63, s50, 10
-; GFX9-NEXT: v_writelane_b32 v63, s51, 11
-; GFX9-NEXT: v_writelane_b32 v63, s52, 12
-; GFX9-NEXT: v_writelane_b32 v63, s53, 13
-; GFX9-NEXT: v_writelane_b32 v63, s54, 14
+; GFX9-NEXT: v_writelane_b32 v63, s34, 0
+; GFX9-NEXT: v_writelane_b32 v63, s35, 1
+; GFX9-NEXT: v_writelane_b32 v63, s36, 2
+; GFX9-NEXT: v_writelane_b32 v63, s37, 3
+; GFX9-NEXT: v_writelane_b32 v63, s38, 4
+; GFX9-NEXT: v_writelane_b32 v63, s39, 5
+; GFX9-NEXT: v_writelane_b32 v63, s48, 6
+; GFX9-NEXT: v_writelane_b32 v63, s49, 7
+; GFX9-NEXT: v_writelane_b32 v63, s50, 8
+; GFX9-NEXT: v_writelane_b32 v63, s51, 9
+; GFX9-NEXT: v_writelane_b32 v63, s52, 10
+; GFX9-NEXT: v_writelane_b32 v63, s53, 11
+; GFX9-NEXT: v_writelane_b32 v63, s54, 12
+; GFX9-NEXT: v_writelane_b32 v63, s55, 13
+; GFX9-NEXT: v_writelane_b32 v63, s30, 14
; GFX9-NEXT: v_mov_b32_e32 v4, s16
; GFX9-NEXT: v_mov_b32_e32 v5, s17
; GFX9-NEXT: v_mov_b32_e32 v6, s18
@@ -87780,7 +87780,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v16, s28
; GFX9-NEXT: v_mov_b32_e32 v17, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_writelane_b32 v63, s55, 15
+; GFX9-NEXT: v_writelane_b32 v63, s31, 15
; GFX9-NEXT: v_readfirstlane_b32 s18, v4
; GFX9-NEXT: v_readfirstlane_b32 s19, v5
; GFX9-NEXT: v_readfirstlane_b32 s16, v6
@@ -88359,22 +88359,22 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_perm_b32 v2, v53, v2, s4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_readlane_b32 s55, v63, 15
-; GFX9-NEXT: v_readlane_b32 s54, v63, 14
-; GFX9-NEXT: v_readlane_b32 s53, v63, 13
-; GFX9-NEXT: v_readlane_b32 s52, v63, 12
-; GFX9-NEXT: v_readlane_b32 s51, v63, 11
-; GFX9-NEXT: v_readlane_b32 s50, v63, 10
-; GFX9-NEXT: v_readlane_b32 s49, v63, 9
-; GFX9-NEXT: v_readlane_b32 s48, v63, 8
-; GFX9-NEXT: v_readlane_b32 s39, v63, 7
-; GFX9-NEXT: v_readlane_b32 s38, v63, 6
-; GFX9-NEXT: v_readlane_b32 s37, v63, 5
-; GFX9-NEXT: v_readlane_b32 s36, v63, 4
-; GFX9-NEXT: v_readlane_b32 s35, v63, 3
-; GFX9-NEXT: v_readlane_b32 s34, v63, 2
-; GFX9-NEXT: v_readlane_b32 s31, v63, 1
-; GFX9-NEXT: v_readlane_b32 s30, v63, 0
+; GFX9-NEXT: v_readlane_b32 s30, v63, 14
+; GFX9-NEXT: v_readlane_b32 s31, v63, 15
+; GFX9-NEXT: v_readlane_b32 s55, v63, 13
+; GFX9-NEXT: v_readlane_b32 s54, v63, 12
+; GFX9-NEXT: v_readlane_b32 s53, v63, 11
+; GFX9-NEXT: v_readlane_b32 s52, v63, 10
+; GFX9-NEXT: v_readlane_b32 s51, v63, 9
+; GFX9-NEXT: v_readlane_b32 s50, v63, 8
+; GFX9-NEXT: v_readlane_b32 s49, v63, 7
+; GFX9-NEXT: v_readlane_b32 s48, v63, 6
+; GFX9-NEXT: v_readlane_b32 s39, v63, 5
+; GFX9-NEXT: v_readlane_b32 s38, v63, 4
+; GFX9-NEXT: v_readlane_b32 s37, v63, 3
+; GFX9-NEXT: v_readlane_b32 s36, v63, 2
+; GFX9-NEXT: v_readlane_b32 s35, v63, 1
+; GFX9-NEXT: v_readlane_b32 s34, v63, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v3, v3, v4, s4
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
@@ -88475,18 +88475,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s27, 24
@@ -89049,21 +89049,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v20, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v22
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:16
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[16:19], off offset:32
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 9
+; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 0
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0
@@ -89076,18 +89076,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s27, 24
@@ -89653,21 +89653,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v19, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v13, v20
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 8
; GFX11-FAKE16-NEXT: s_clause 0x3
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:16
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:32
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 9
+; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 0
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0
@@ -89690,7 +89690,7 @@ end:
ret <64 x i8> %phi
}
-define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
+define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -92181,7 +92181,7 @@ end:
ret <32 x bfloat> %phi
}
-define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, i32 inreg %b) {
+define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v64i8_to_v32bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -92198,41 +92198,39 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v40, s30, 0
-; SI-NEXT: v_writelane_b32 v40, s31, 1
-; SI-NEXT: v_writelane_b32 v40, s34, 2
-; SI-NEXT: v_writelane_b32 v40, s35, 3
-; SI-NEXT: v_writelane_b32 v40, s36, 4
-; SI-NEXT: v_writelane_b32 v40, s37, 5
-; SI-NEXT: v_writelane_b32 v40, s38, 6
-; SI-NEXT: v_writelane_b32 v40, s39, 7
-; SI-NEXT: v_writelane_b32 v40, s48, 8
-; SI-NEXT: v_writelane_b32 v40, s49, 9
-; SI-NEXT: v_writelane_b32 v40, s50, 10
-; SI-NEXT: v_writelane_b32 v40, s51, 11
-; SI-NEXT: v_writelane_b32 v40, s52, 12
-; SI-NEXT: v_writelane_b32 v40, s53, 13
-; SI-NEXT: v_writelane_b32 v40, s54, 14
-; SI-NEXT: v_writelane_b32 v40, s55, 15
-; SI-NEXT: v_writelane_b32 v40, s64, 16
-; SI-NEXT: v_writelane_b32 v40, s65, 17
-; SI-NEXT: v_writelane_b32 v40, s66, 18
-; SI-NEXT: v_writelane_b32 v40, s67, 19
-; SI-NEXT: v_writelane_b32 v40, s68, 20
-; SI-NEXT: v_writelane_b32 v40, s69, 21
-; SI-NEXT: v_writelane_b32 v40, s70, 22
-; SI-NEXT: v_writelane_b32 v40, s71, 23
-; SI-NEXT: v_writelane_b32 v40, s80, 24
-; SI-NEXT: v_writelane_b32 v40, s81, 25
-; SI-NEXT: v_writelane_b32 v40, s82, 26
-; SI-NEXT: v_writelane_b32 v40, s83, 27
-; SI-NEXT: v_writelane_b32 v40, s84, 28
-; SI-NEXT: v_writelane_b32 v40, s85, 29
-; SI-NEXT: v_writelane_b32 v40, s86, 30
-; SI-NEXT: v_writelane_b32 v40, s87, 31
+; SI-NEXT: v_writelane_b32 v40, s34, 0
+; SI-NEXT: v_writelane_b32 v40, s35, 1
+; SI-NEXT: v_writelane_b32 v40, s36, 2
+; SI-NEXT: v_writelane_b32 v40, s37, 3
+; SI-NEXT: v_writelane_b32 v40, s38, 4
+; SI-NEXT: v_writelane_b32 v40, s39, 5
+; SI-NEXT: v_writelane_b32 v40, s48, 6
+; SI-NEXT: v_writelane_b32 v40, s49, 7
+; SI-NEXT: v_writelane_b32 v40, s50, 8
+; SI-NEXT: v_writelane_b32 v40, s51, 9
+; SI-NEXT: v_writelane_b32 v40, s52, 10
+; SI-NEXT: v_writelane_b32 v40, s53, 11
+; SI-NEXT: v_writelane_b32 v40, s54, 12
+; SI-NEXT: v_writelane_b32 v40, s55, 13
+; SI-NEXT: v_writelane_b32 v40, s64, 14
+; SI-NEXT: v_writelane_b32 v40, s65, 15
+; SI-NEXT: v_writelane_b32 v40, s66, 16
+; SI-NEXT: v_writelane_b32 v40, s67, 17
+; SI-NEXT: v_writelane_b32 v40, s68, 18
+; SI-NEXT: v_writelane_b32 v40, s69, 19
+; SI-NEXT: v_writelane_b32 v40, s70, 20
+; SI-NEXT: v_writelane_b32 v40, s71, 21
+; SI-NEXT: v_writelane_b32 v40, s80, 22
+; SI-NEXT: v_writelane_b32 v40, s81, 23
+; SI-NEXT: v_writelane_b32 v40, s82, 24
+; SI-NEXT: v_writelane_b32 v40, s83, 25
+; SI-NEXT: v_writelane_b32 v40, s84, 26
+; SI-NEXT: v_writelane_b32 v40, s85, 27
+; SI-NEXT: v_writelane_b32 v40, s86, 28
+; SI-NEXT: v_writelane_b32 v40, s87, 29
; SI-NEXT: s_mov_b32 s6, s16
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_writelane_b32 v40, s96, 30
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s18, 0
; SI-NEXT: v_writelane_b32 v41, s19, 1
@@ -92240,12 +92238,15 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_writelane_b32 v41, s17, 3
; SI-NEXT: v_writelane_b32 v41, s21, 4
; SI-NEXT: v_writelane_b32 v41, s22, 5
+; SI-NEXT: v_writelane_b32 v40, s97, 31
; SI-NEXT: v_writelane_b32 v41, s20, 6
+; SI-NEXT: v_writelane_b32 v40, s98, 32
; SI-NEXT: v_writelane_b32 v41, s25, 7
+; SI-NEXT: v_writelane_b32 v40, s99, 33
; SI-NEXT: v_writelane_b32 v41, s29, 8
-; SI-NEXT: v_writelane_b32 v40, s97, 33
+; SI-NEXT: v_writelane_b32 v40, s30, 34
; SI-NEXT: v_writelane_b32 v41, s24, 9
-; SI-NEXT: v_writelane_b32 v40, s98, 34
+; SI-NEXT: v_writelane_b32 v40, s31, 35
; SI-NEXT: v_readfirstlane_b32 s95, v30
; SI-NEXT: v_readfirstlane_b32 s88, v29
; SI-NEXT: v_readfirstlane_b32 s79, v28
@@ -92271,7 +92272,6 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s15, v8
; SI-NEXT: v_readfirstlane_b32 s43, v7
; SI-NEXT: v_readfirstlane_b32 s44, v6
-; SI-NEXT: v_readfirstlane_b32 s11, v5
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s87, v31
; SI-NEXT: s_waitcnt vmcnt(6)
@@ -92299,13 +92299,13 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s55, v38
+; SI-NEXT: v_readfirstlane_b32 s11, v5
; SI-NEXT: v_readfirstlane_b32 s10, v4
; SI-NEXT: v_readfirstlane_b32 s13, v3
; SI-NEXT: v_readfirstlane_b32 s14, v2
; SI-NEXT: v_readfirstlane_b32 s7, v1
; SI-NEXT: v_readfirstlane_b32 s12, v0
; SI-NEXT: v_writelane_b32 v41, s28, 10
-; SI-NEXT: v_writelane_b32 v40, s99, 35
; SI-NEXT: v_writelane_b32 v41, s7, 11
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s48, v31
@@ -92783,43 +92783,43 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s87
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s96
+; SI-NEXT: v_readlane_b32 s30, v40, 34
; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
-; SI-NEXT: v_readlane_b32 s99, v40, 35
-; SI-NEXT: v_readlane_b32 s98, v40, 34
-; SI-NEXT: v_readlane_b32 s97, v40, 33
-; SI-NEXT: v_readlane_b32 s96, v40, 32
-; SI-NEXT: v_readlane_b32 s87, v40, 31
-; SI-NEXT: v_readlane_b32 s86, v40, 30
-; SI-NEXT: v_readlane_b32 s85, v40, 29
-; SI-NEXT: v_readlane_b32 s84, v40, 28
-; SI-NEXT: v_readlane_b32 s83, v40, 27
-; SI-NEXT: v_readlane_b32 s82, v40, 26
-; SI-NEXT: v_readlane_b32 s81, v40, 25
-; SI-NEXT: v_readlane_b32 s80, v40, 24
-; SI-NEXT: v_readlane_b32 s71, v40, 23
-; SI-NEXT: v_readlane_b32 s70, v40, 22
-; SI-NEXT: v_readlane_b32 s69, v40, 21
-; SI-NEXT: v_readlane_b32 s68, v40, 20
-; SI-NEXT: v_readlane_b32 s67, v40, 19
-; SI-NEXT: v_readlane_b32 s66, v40, 18
-; SI-NEXT: v_readlane_b32 s65, v40, 17
-; SI-NEXT: v_readlane_b32 s64, v40, 16
-; SI-NEXT: v_readlane_b32 s55, v40, 15
-; SI-NEXT: v_readlane_b32 s54, v40, 14
-; SI-NEXT: v_readlane_b32 s53, v40, 13
-; SI-NEXT: v_readlane_b32 s52, v40, 12
-; SI-NEXT: v_readlane_b32 s51, v40, 11
-; SI-NEXT: v_readlane_b32 s50, v40, 10
-; SI-NEXT: v_readlane_b32 s49, v40, 9
-; SI-NEXT: v_readlane_b32 s48, v40, 8
-; SI-NEXT: v_readlane_b32 s39, v40, 7
-; SI-NEXT: v_readlane_b32 s38, v40, 6
-; SI-NEXT: v_readlane_b32 s37, v40, 5
-; SI-NEXT: v_readlane_b32 s36, v40, 4
-; SI-NEXT: v_readlane_b32 s35, v40, 3
-; SI-NEXT: v_readlane_b32 s34, v40, 2
-; SI-NEXT: v_readlane_b32 s31, v40, 1
-; SI-NEXT: v_readlane_b32 s30, v40, 0
+; SI-NEXT: v_readlane_b32 s31, v40, 35
+; SI-NEXT: v_readlane_b32 s99, v40, 33
+; SI-NEXT: v_readlane_b32 s98, v40, 32
+; SI-NEXT: v_readlane_b32 s97, v40, 31
+; SI-NEXT: v_readlane_b32 s96, v40, 30
+; SI-NEXT: v_readlane_b32 s87, v40, 29
+; SI-NEXT: v_readlane_b32 s86, v40, 28
+; SI-NEXT: v_readlane_b32 s85, v40, 27
+; SI-NEXT: v_readlane_b32 s84, v40, 26
+; SI-NEXT: v_readlane_b32 s83, v40, 25
+; SI-NEXT: v_readlane_b32 s82, v40, 24
+; SI-NEXT: v_readlane_b32 s81, v40, 23
+; SI-NEXT: v_readlane_b32 s80, v40, 22
+; SI-NEXT: v_readlane_b32 s71, v40, 21
+; SI-NEXT: v_readlane_b32 s70, v40, 20
+; SI-NEXT: v_readlane_b32 s69, v40, 19
+; SI-NEXT: v_readlane_b32 s68, v40, 18
+; SI-NEXT: v_readlane_b32 s67, v40, 17
+; SI-NEXT: v_readlane_b32 s66, v40, 16
+; SI-NEXT: v_readlane_b32 s65, v40, 15
+; SI-NEXT: v_readlane_b32 s64, v40, 14
+; SI-NEXT: v_readlane_b32 s55, v40, 13
+; SI-NEXT: v_readlane_b32 s54, v40, 12
+; SI-NEXT: v_readlane_b32 s53, v40, 11
+; SI-NEXT: v_readlane_b32 s52, v40, 10
+; SI-NEXT: v_readlane_b32 s51, v40, 9
+; SI-NEXT: v_readlane_b32 s50, v40, 8
+; SI-NEXT: v_readlane_b32 s49, v40, 7
+; SI-NEXT: v_readlane_b32 s48, v40, 6
+; SI-NEXT: v_readlane_b32 s39, v40, 5
+; SI-NEXT: v_readlane_b32 s38, v40, 4
+; SI-NEXT: v_readlane_b32 s37, v40, 3
+; SI-NEXT: v_readlane_b32 s36, v40, 2
+; SI-NEXT: v_readlane_b32 s35, v40, 1
+; SI-NEXT: v_readlane_b32 s34, v40, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -94135,3 +94135,5 @@ end:
%phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <32 x bfloat> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 361a93919fed7..5b410a0326a3a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) {
+define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -148,7 +148,7 @@ end:
ret <18 x float> %phi
}
-define inreg <18 x float> @bitcast_v18i32_to_v18f32_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v18i32_to_v18f32_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v18f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -362,7 +362,7 @@ end:
ret <18 x float> %phi
}
-define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) {
+define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -495,7 +495,7 @@ end:
ret <18 x i32> %phi
}
-define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v18i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -762,7 +762,7 @@ end:
ret <18 x i32> %phi
}
-define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) {
+define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -904,7 +904,7 @@ end:
ret <9 x i64> %phi
}
-define inreg <9 x i64> @bitcast_v18i32_to_v9i64_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v18i32_to_v9i64_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v9i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1118,7 +1118,7 @@ end:
ret <9 x i64> %phi
}
-define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) {
+define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1265,7 +1265,7 @@ end:
ret <18 x i32> %phi
}
-define inreg <18 x i32> @bitcast_v9i64_to_v18i32_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v9i64_to_v18i32_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v18i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1479,7 +1479,7 @@ end:
ret <18 x i32> %phi
}
-define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) {
+define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1621,7 +1621,7 @@ end:
ret <9 x double> %phi
}
-define inreg <9 x double> @bitcast_v18i32_to_v9f64_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v18i32_to_v9f64_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v9f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1835,7 +1835,7 @@ end:
ret <9 x double> %phi
}
-define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) {
+define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1941,7 +1941,7 @@ end:
ret <18 x i32> %phi
}
-define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v18i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2172,7 +2172,7 @@ end:
ret <18 x i32> %phi
}
-define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) {
+define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v36i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2702,7 +2702,7 @@ end:
ret <36 x i16> %phi
}
-define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v36i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3369,7 +3369,7 @@ end:
ret <36 x i16> %phi
}
-define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) {
+define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4110,7 +4110,7 @@ end:
ret <18 x i32> %phi
}
-define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v18i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4717,7 +4717,7 @@ end:
ret <18 x i32> %phi
}
-define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) {
+define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5247,7 +5247,7 @@ end:
ret <36 x half> %phi
}
-define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18i32_to_v36f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5914,7 +5914,7 @@ end:
ret <36 x half> %phi
}
-define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
+define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6729,7 +6729,7 @@ end:
ret <18 x i32> %phi
}
-define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v18i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7351,7 +7351,7 @@ end:
ret <18 x i32> %phi
}
-define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) {
+define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7484,7 +7484,7 @@ end:
ret <9 x i64> %phi
}
-define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v9i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7751,7 +7751,7 @@ end:
ret <9 x i64> %phi
}
-define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) {
+define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7898,7 +7898,7 @@ end:
ret <18 x float> %phi
}
-define inreg <18 x float> @bitcast_v9i64_to_v18f32_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v9i64_to_v18f32_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v18f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8112,7 +8112,7 @@ end:
ret <18 x float> %phi
}
-define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) {
+define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8245,7 +8245,7 @@ end:
ret <9 x double> %phi
}
-define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v9f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8512,7 +8512,7 @@ end:
ret <9 x double> %phi
}
-define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) {
+define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8618,7 +8618,7 @@ end:
ret <18 x float> %phi
}
-define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v18f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8849,7 +8849,7 @@ end:
ret <18 x float> %phi
}
-define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) {
+define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v36i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9361,7 +9361,7 @@ end:
ret <36 x i16> %phi
}
-define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v36i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10096,7 +10096,7 @@ end:
ret <36 x i16> %phi
}
-define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) {
+define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10837,7 +10837,7 @@ end:
ret <18 x float> %phi
}
-define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v18f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11444,7 +11444,7 @@ end:
ret <18 x float> %phi
}
-define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) {
+define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11956,7 +11956,7 @@ end:
ret <36 x half> %phi
}
-define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v18f32_to_v36f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12691,7 +12691,7 @@ end:
ret <36 x half> %phi
}
-define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
+define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13506,7 +13506,7 @@ end:
ret <18 x float> %phi
}
-define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v18f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14128,7 +14128,7 @@ end:
ret <18 x float> %phi
}
-define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) {
+define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14275,7 +14275,7 @@ end:
ret <9 x double> %phi
}
-define inreg <9 x double> @bitcast_v9i64_to_v9f64_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v9i64_to_v9f64_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v9f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14488,7 +14488,7 @@ end:
ret <9 x double> %phi
}
-define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) {
+define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14594,7 +14594,7 @@ end:
ret <9 x i64> %phi
}
-define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v9i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14825,7 +14825,7 @@ end:
ret <9 x i64> %phi
}
-define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) {
+define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v36i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15365,7 +15365,7 @@ end:
ret <36 x i16> %phi
}
-define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v36i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16032,7 +16032,7 @@ end:
ret <36 x i16> %phi
}
-define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) {
+define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16773,7 +16773,7 @@ end:
ret <9 x i64> %phi
}
-define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v9i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17380,7 +17380,7 @@ end:
ret <9 x i64> %phi
}
-define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) {
+define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17920,7 +17920,7 @@ end:
ret <36 x half> %phi
}
-define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9i64_to_v36f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18587,7 +18587,7 @@ end:
ret <36 x half> %phi
}
-define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
+define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19402,7 +19402,7 @@ end:
ret <9 x i64> %phi
}
-define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v9i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20024,7 +20024,7 @@ end:
ret <9 x i64> %phi
}
-define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) {
+define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v36i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20509,7 +20509,7 @@ end:
ret <36 x i16> %phi
}
-define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v36i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21202,7 +21202,7 @@ end:
ret <36 x i16> %phi
}
-define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) {
+define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21943,7 +21943,7 @@ end:
ret <9 x double> %phi
}
-define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v9f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22550,7 +22550,7 @@ end:
ret <9 x double> %phi
}
-define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) {
+define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23035,7 +23035,7 @@ end:
ret <36 x half> %phi
}
-define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v9f64_to_v36f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23728,7 +23728,7 @@ end:
ret <36 x half> %phi
}
-define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
+define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24543,7 +24543,7 @@ end:
ret <9 x double> %phi
}
-define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v9f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25165,7 +25165,7 @@ end:
ret <9 x double> %phi
}
-define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
+define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25919,7 +25919,7 @@ end:
ret <36 x half> %phi
}
-define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i32 inreg %b) {
+define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36i16_to_v36f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25927,21 +25927,22 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v18, s30, 0
-; SI-NEXT: v_writelane_b32 v18, s31, 1
-; SI-NEXT: v_writelane_b32 v18, s34, 2
-; SI-NEXT: v_writelane_b32 v18, s35, 3
-; SI-NEXT: v_writelane_b32 v18, s36, 4
-; SI-NEXT: v_writelane_b32 v18, s37, 5
-; SI-NEXT: v_writelane_b32 v18, s38, 6
-; SI-NEXT: v_writelane_b32 v18, s39, 7
-; SI-NEXT: v_writelane_b32 v18, s48, 8
-; SI-NEXT: v_writelane_b32 v18, s49, 9
-; SI-NEXT: v_writelane_b32 v18, s50, 10
-; SI-NEXT: v_writelane_b32 v18, s51, 11
-; SI-NEXT: v_writelane_b32 v18, s52, 12
-; SI-NEXT: v_writelane_b32 v18, s53, 13
-; SI-NEXT: v_writelane_b32 v18, s54, 14
+; SI-NEXT: v_writelane_b32 v18, s34, 0
+; SI-NEXT: v_writelane_b32 v18, s35, 1
+; SI-NEXT: v_writelane_b32 v18, s36, 2
+; SI-NEXT: v_writelane_b32 v18, s37, 3
+; SI-NEXT: v_writelane_b32 v18, s38, 4
+; SI-NEXT: v_writelane_b32 v18, s39, 5
+; SI-NEXT: v_writelane_b32 v18, s48, 6
+; SI-NEXT: v_writelane_b32 v18, s49, 7
+; SI-NEXT: v_writelane_b32 v18, s50, 8
+; SI-NEXT: v_writelane_b32 v18, s51, 9
+; SI-NEXT: v_writelane_b32 v18, s52, 10
+; SI-NEXT: v_writelane_b32 v18, s53, 11
+; SI-NEXT: v_writelane_b32 v18, s54, 12
+; SI-NEXT: v_writelane_b32 v18, s55, 13
+; SI-NEXT: v_writelane_b32 v18, s30, 14
+; SI-NEXT: v_writelane_b32 v18, s31, 15
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
@@ -25961,7 +25962,6 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s90, s17, 16
; SI-NEXT: s_lshr_b32 s35, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT: v_writelane_b32 v18, s55, 15
; SI-NEXT: v_readfirstlane_b32 s52, v3
; SI-NEXT: v_readfirstlane_b32 s54, v2
; SI-NEXT: v_readfirstlane_b32 s50, v1
@@ -26210,6 +26210,7 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s21, s43, 0xffff
; SI-NEXT: s_lshl_b32 s22, s34, 16
; SI-NEXT: s_or_b32 s21, s21, s22
+; SI-NEXT: v_readlane_b32 s30, v18, 14
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s14
@@ -26228,22 +26229,21 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v15, s19
; SI-NEXT: v_mov_b32_e32 v16, s20
; SI-NEXT: v_mov_b32_e32 v17, s21
-; SI-NEXT: v_readlane_b32 s55, v18, 15
-; SI-NEXT: v_readlane_b32 s54, v18, 14
-; SI-NEXT: v_readlane_b32 s53, v18, 13
-; SI-NEXT: v_readlane_b32 s52, v18, 12
-; SI-NEXT: v_readlane_b32 s51, v18, 11
-; SI-NEXT: v_readlane_b32 s50, v18, 10
-; SI-NEXT: v_readlane_b32 s49, v18, 9
-; SI-NEXT: v_readlane_b32 s48, v18, 8
-; SI-NEXT: v_readlane_b32 s39, v18, 7
-; SI-NEXT: v_readlane_b32 s38, v18, 6
-; SI-NEXT: v_readlane_b32 s37, v18, 5
-; SI-NEXT: v_readlane_b32 s36, v18, 4
-; SI-NEXT: v_readlane_b32 s35, v18, 3
-; SI-NEXT: v_readlane_b32 s34, v18, 2
-; SI-NEXT: v_readlane_b32 s31, v18, 1
-; SI-NEXT: v_readlane_b32 s30, v18, 0
+; SI-NEXT: v_readlane_b32 s31, v18, 15
+; SI-NEXT: v_readlane_b32 s55, v18, 13
+; SI-NEXT: v_readlane_b32 s54, v18, 12
+; SI-NEXT: v_readlane_b32 s53, v18, 11
+; SI-NEXT: v_readlane_b32 s52, v18, 10
+; SI-NEXT: v_readlane_b32 s51, v18, 9
+; SI-NEXT: v_readlane_b32 s50, v18, 8
+; SI-NEXT: v_readlane_b32 s49, v18, 7
+; SI-NEXT: v_readlane_b32 s48, v18, 6
+; SI-NEXT: v_readlane_b32 s39, v18, 5
+; SI-NEXT: v_readlane_b32 s38, v18, 4
+; SI-NEXT: v_readlane_b32 s37, v18, 3
+; SI-NEXT: v_readlane_b32 s36, v18, 2
+; SI-NEXT: v_readlane_b32 s35, v18, 1
+; SI-NEXT: v_readlane_b32 s34, v18, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -26858,7 +26858,7 @@ end:
ret <36 x half> %phi
}
-define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) {
+define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v36i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27458,7 +27458,7 @@ end:
ret <36 x i16> %phi
}
-define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i32 inreg %b) {
+define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v36f16_to_v36i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28342,3 +28342,5 @@ end:
%phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <36 x i16> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 9896de3fe8c5e..233fae3707e43 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) {
+define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -156,7 +156,7 @@ end:
ret <20 x float> %phi
}
-define inreg <20 x float> @bitcast_v20i32_to_v20f32_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v20i32_to_v20f32_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v20f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,7 +386,7 @@ end:
ret <20 x float> %phi
}
-define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) {
+define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -526,7 +526,7 @@ end:
ret <20 x i32> %phi
}
-define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v20i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -746,7 +746,7 @@ end:
ret <20 x i32> %phi
}
-define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) {
+define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -896,7 +896,7 @@ end:
ret <10 x i64> %phi
}
-define inreg <10 x i64> @bitcast_v20i32_to_v10i64_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v20i32_to_v10i64_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v10i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1126,7 +1126,7 @@ end:
ret <10 x i64> %phi
}
-define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) {
+define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1281,7 +1281,7 @@ end:
ret <20 x i32> %phi
}
-define inreg <20 x i32> @bitcast_v10i64_to_v20i32_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v10i64_to_v20i32_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v20i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1516,7 +1516,7 @@ end:
ret <20 x i32> %phi
}
-define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) {
+define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1666,7 +1666,7 @@ end:
ret <10 x double> %phi
}
-define inreg <10 x double> @bitcast_v20i32_to_v10f64_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v20i32_to_v10f64_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v10f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1896,7 +1896,7 @@ end:
ret <10 x double> %phi
}
-define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) {
+define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2006,7 +2006,7 @@ end:
ret <20 x i32> %phi
}
-define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v20i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2196,7 +2196,7 @@ end:
ret <20 x i32> %phi
}
-define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) {
+define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v40i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2774,7 +2774,7 @@ end:
ret <40 x i16> %phi
}
-define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v40i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3531,7 +3531,7 @@ end:
ret <40 x i16> %phi
}
-define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
+define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4379,7 +4379,7 @@ end:
ret <20 x i32> %phi
}
-define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v20i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5135,7 +5135,7 @@ end:
ret <20 x i32> %phi
}
-define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) {
+define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5713,7 +5713,7 @@ end:
ret <40 x half> %phi
}
-define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20i32_to_v40f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6470,7 +6470,7 @@ end:
ret <40 x half> %phi
}
-define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) {
+define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7403,7 +7403,7 @@ end:
ret <20 x i32> %phi
}
-define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v20i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8180,7 +8180,7 @@ end:
ret <20 x i32> %phi
}
-define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) {
+define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8320,7 +8320,7 @@ end:
ret <10 x i64> %phi
}
-define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v10i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8540,7 +8540,7 @@ end:
ret <10 x i64> %phi
}
-define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) {
+define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8695,7 +8695,7 @@ end:
ret <20 x float> %phi
}
-define inreg <20 x float> @bitcast_v10i64_to_v20f32_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v10i64_to_v20f32_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v20f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8930,7 +8930,7 @@ end:
ret <20 x float> %phi
}
-define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) {
+define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9070,7 +9070,7 @@ end:
ret <10 x double> %phi
}
-define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v10f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9290,7 +9290,7 @@ end:
ret <10 x double> %phi
}
-define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) {
+define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9400,7 +9400,7 @@ end:
ret <20 x float> %phi
}
-define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v20f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9590,7 +9590,7 @@ end:
ret <20 x float> %phi
}
-define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) {
+define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v40i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10148,7 +10148,7 @@ end:
ret <40 x i16> %phi
}
-define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v40i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10913,7 +10913,7 @@ end:
ret <40 x i16> %phi
}
-define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
+define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11761,7 +11761,7 @@ end:
ret <20 x float> %phi
}
-define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v20f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12517,7 +12517,7 @@ end:
ret <20 x float> %phi
}
-define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) {
+define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13075,7 +13075,7 @@ end:
ret <40 x half> %phi
}
-define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v20f32_to_v40f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13840,7 +13840,7 @@ end:
ret <40 x half> %phi
}
-define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) {
+define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14773,7 +14773,7 @@ end:
ret <20 x float> %phi
}
-define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v20f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15550,7 +15550,7 @@ end:
ret <20 x float> %phi
}
-define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) {
+define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15705,7 +15705,7 @@ end:
ret <10 x double> %phi
}
-define inreg <10 x double> @bitcast_v10i64_to_v10f64_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v10i64_to_v10f64_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v10f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15940,7 +15940,7 @@ end:
ret <10 x double> %phi
}
-define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) {
+define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16050,7 +16050,7 @@ end:
ret <10 x i64> %phi
}
-define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v10i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16240,7 +16240,7 @@ end:
ret <10 x i64> %phi
}
-define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) {
+define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v40i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16828,7 +16828,7 @@ end:
ret <40 x i16> %phi
}
-define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v40i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17585,7 +17585,7 @@ end:
ret <40 x i16> %phi
}
-define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
+define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18433,7 +18433,7 @@ end:
ret <10 x i64> %phi
}
-define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v10i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19189,7 +19189,7 @@ end:
ret <10 x i64> %phi
}
-define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) {
+define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19777,7 +19777,7 @@ end:
ret <40 x half> %phi
}
-define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10i64_to_v40f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20534,7 +20534,7 @@ end:
ret <40 x half> %phi
}
-define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) {
+define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21467,7 +21467,7 @@ end:
ret <10 x i64> %phi
}
-define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v10i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22244,7 +22244,7 @@ end:
ret <10 x i64> %phi
}
-define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) {
+define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v40i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22772,7 +22772,7 @@ end:
ret <40 x i16> %phi
}
-define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v40i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23507,7 +23507,7 @@ end:
ret <40 x i16> %phi
}
-define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
+define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24355,7 +24355,7 @@ end:
ret <10 x double> %phi
}
-define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v10f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25111,7 +25111,7 @@ end:
ret <10 x double> %phi
}
-define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) {
+define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25639,7 +25639,7 @@ end:
ret <40 x half> %phi
}
-define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v10f64_to_v40f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26374,7 +26374,7 @@ end:
ret <40 x half> %phi
}
-define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) {
+define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27307,7 +27307,7 @@ end:
ret <10 x double> %phi
}
-define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v10f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28084,7 +28084,7 @@ end:
ret <10 x double> %phi
}
-define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
+define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28952,7 +28952,7 @@ end:
ret <40 x half> %phi
}
-define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i32 inreg %b) {
+define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40i16_to_v40f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28960,29 +28960,30 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v20, s30, 0
-; SI-NEXT: v_writelane_b32 v20, s31, 1
-; SI-NEXT: v_writelane_b32 v20, s34, 2
-; SI-NEXT: v_writelane_b32 v20, s35, 3
-; SI-NEXT: v_writelane_b32 v20, s36, 4
-; SI-NEXT: v_writelane_b32 v20, s37, 5
-; SI-NEXT: v_writelane_b32 v20, s38, 6
-; SI-NEXT: v_writelane_b32 v20, s39, 7
-; SI-NEXT: v_writelane_b32 v20, s48, 8
-; SI-NEXT: v_writelane_b32 v20, s49, 9
-; SI-NEXT: v_writelane_b32 v20, s50, 10
-; SI-NEXT: v_writelane_b32 v20, s51, 11
-; SI-NEXT: v_writelane_b32 v20, s52, 12
-; SI-NEXT: v_writelane_b32 v20, s53, 13
-; SI-NEXT: v_writelane_b32 v20, s54, 14
-; SI-NEXT: v_writelane_b32 v20, s55, 15
-; SI-NEXT: v_writelane_b32 v20, s64, 16
-; SI-NEXT: v_writelane_b32 v20, s65, 17
-; SI-NEXT: v_writelane_b32 v20, s66, 18
-; SI-NEXT: v_writelane_b32 v20, s67, 19
-; SI-NEXT: v_writelane_b32 v20, s68, 20
-; SI-NEXT: v_writelane_b32 v20, s69, 21
-; SI-NEXT: v_writelane_b32 v20, s70, 22
+; SI-NEXT: v_writelane_b32 v20, s34, 0
+; SI-NEXT: v_writelane_b32 v20, s35, 1
+; SI-NEXT: v_writelane_b32 v20, s36, 2
+; SI-NEXT: v_writelane_b32 v20, s37, 3
+; SI-NEXT: v_writelane_b32 v20, s38, 4
+; SI-NEXT: v_writelane_b32 v20, s39, 5
+; SI-NEXT: v_writelane_b32 v20, s48, 6
+; SI-NEXT: v_writelane_b32 v20, s49, 7
+; SI-NEXT: v_writelane_b32 v20, s50, 8
+; SI-NEXT: v_writelane_b32 v20, s51, 9
+; SI-NEXT: v_writelane_b32 v20, s52, 10
+; SI-NEXT: v_writelane_b32 v20, s53, 11
+; SI-NEXT: v_writelane_b32 v20, s54, 12
+; SI-NEXT: v_writelane_b32 v20, s55, 13
+; SI-NEXT: v_writelane_b32 v20, s64, 14
+; SI-NEXT: v_writelane_b32 v20, s65, 15
+; SI-NEXT: v_writelane_b32 v20, s66, 16
+; SI-NEXT: v_writelane_b32 v20, s67, 17
+; SI-NEXT: v_writelane_b32 v20, s68, 18
+; SI-NEXT: v_writelane_b32 v20, s69, 19
+; SI-NEXT: v_writelane_b32 v20, s70, 20
+; SI-NEXT: v_writelane_b32 v20, s71, 21
+; SI-NEXT: v_writelane_b32 v20, s30, 22
+; SI-NEXT: v_writelane_b32 v20, s31, 23
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
@@ -29004,7 +29005,6 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s94, s17, 16
; SI-NEXT: s_lshr_b32 s48, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT: v_writelane_b32 v20, s71, 23
; SI-NEXT: v_readfirstlane_b32 s69, v5
; SI-NEXT: v_readfirstlane_b32 s70, v4
; SI-NEXT: v_readfirstlane_b32 s65, v3
@@ -29283,6 +29283,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_lshl_b32 s24, s39, 16
; SI-NEXT: s_or_b32 s5, s5, s24
+; SI-NEXT: v_readlane_b32 s30, v20, 22
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
@@ -29303,30 +29304,29 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v17, s7
; SI-NEXT: v_mov_b32_e32 v18, s4
; SI-NEXT: v_mov_b32_e32 v19, s5
-; SI-NEXT: v_readlane_b32 s71, v20, 23
-; SI-NEXT: v_readlane_b32 s70, v20, 22
-; SI-NEXT: v_readlane_b32 s69, v20, 21
-; SI-NEXT: v_readlane_b32 s68, v20, 20
-; SI-NEXT: v_readlane_b32 s67, v20, 19
-; SI-NEXT: v_readlane_b32 s66, v20, 18
-; SI-NEXT: v_readlane_b32 s65, v20, 17
-; SI-NEXT: v_readlane_b32 s64, v20, 16
-; SI-NEXT: v_readlane_b32 s55, v20, 15
-; SI-NEXT: v_readlane_b32 s54, v20, 14
-; SI-NEXT: v_readlane_b32 s53, v20, 13
-; SI-NEXT: v_readlane_b32 s52, v20, 12
-; SI-NEXT: v_readlane_b32 s51, v20, 11
-; SI-NEXT: v_readlane_b32 s50, v20, 10
-; SI-NEXT: v_readlane_b32 s49, v20, 9
-; SI-NEXT: v_readlane_b32 s48, v20, 8
-; SI-NEXT: v_readlane_b32 s39, v20, 7
-; SI-NEXT: v_readlane_b32 s38, v20, 6
-; SI-NEXT: v_readlane_b32 s37, v20, 5
-; SI-NEXT: v_readlane_b32 s36, v20, 4
-; SI-NEXT: v_readlane_b32 s35, v20, 3
-; SI-NEXT: v_readlane_b32 s34, v20, 2
-; SI-NEXT: v_readlane_b32 s31, v20, 1
-; SI-NEXT: v_readlane_b32 s30, v20, 0
+; SI-NEXT: v_readlane_b32 s31, v20, 23
+; SI-NEXT: v_readlane_b32 s71, v20, 21
+; SI-NEXT: v_readlane_b32 s70, v20, 20
+; SI-NEXT: v_readlane_b32 s69, v20, 19
+; SI-NEXT: v_readlane_b32 s68, v20, 18
+; SI-NEXT: v_readlane_b32 s67, v20, 17
+; SI-NEXT: v_readlane_b32 s66, v20, 16
+; SI-NEXT: v_readlane_b32 s65, v20, 15
+; SI-NEXT: v_readlane_b32 s64, v20, 14
+; SI-NEXT: v_readlane_b32 s55, v20, 13
+; SI-NEXT: v_readlane_b32 s54, v20, 12
+; SI-NEXT: v_readlane_b32 s53, v20, 11
+; SI-NEXT: v_readlane_b32 s52, v20, 10
+; SI-NEXT: v_readlane_b32 s51, v20, 9
+; SI-NEXT: v_readlane_b32 s50, v20, 8
+; SI-NEXT: v_readlane_b32 s49, v20, 7
+; SI-NEXT: v_readlane_b32 s48, v20, 6
+; SI-NEXT: v_readlane_b32 s39, v20, 5
+; SI-NEXT: v_readlane_b32 s38, v20, 4
+; SI-NEXT: v_readlane_b32 s37, v20, 3
+; SI-NEXT: v_readlane_b32 s36, v20, 2
+; SI-NEXT: v_readlane_b32 s35, v20, 1
+; SI-NEXT: v_readlane_b32 s34, v20, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -29998,7 +29998,7 @@ end:
ret <40 x half> %phi
}
-define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) {
+define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v40i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30655,7 +30655,7 @@ end:
ret <40 x i16> %phi
}
-define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i32 inreg %b) {
+define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v40f16_to_v40i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31633,3 +31633,5 @@ end:
%phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <40 x i16> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 45e835ddb0e28..d975358b3570f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define double @bitcast_i64_to_f64(i64 %a, i32 %b) {
+define double @bitcast_i64_to_f64(i64 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i64_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -81,7 +81,7 @@ end:
ret double %phi
}
-define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i64_to_f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -168,7 +168,7 @@ end:
ret double %phi
}
-define i64 @bitcast_f64_to_i64(double %a, i32 %b) {
+define i64 @bitcast_f64_to_i64(double %a, i32 %b) #0 {
; SI-LABEL: bitcast_f64_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -239,7 +239,7 @@ end:
ret i64 %phi
}
-define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f64_to_i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -325,7 +325,7 @@ end:
ret i64 %phi
}
-define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) {
+define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i64_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -400,7 +400,7 @@ end:
ret <2 x i32> %phi
}
-define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i64_to_v2i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -487,7 +487,7 @@ end:
ret <2 x i32> %phi
}
-define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) {
+define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -561,7 +561,7 @@ end:
ret i64 %phi
}
-define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i32_to_i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -648,7 +648,7 @@ end:
ret i64 %phi
}
-define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) {
+define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i64_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -723,7 +723,7 @@ end:
ret <2 x float> %phi
}
-define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i64_to_v2f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -810,7 +810,7 @@ end:
ret <2 x float> %phi
}
-define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) {
+define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -883,7 +883,7 @@ end:
ret i64 %phi
}
-define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f32_to_i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -973,7 +973,7 @@ end:
ret i64 %phi
}
-define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) {
+define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i64_to_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1062,7 +1062,7 @@ end:
ret <4 x i16> %phi
}
-define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i64_to_v4i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1161,7 +1161,7 @@ end:
ret <4 x i16> %phi
}
-define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) {
+define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i16_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1269,7 +1269,7 @@ end:
ret i64 %phi
}
-define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i16_to_i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1382,7 +1382,7 @@ end:
ret i64 %phi
}
-define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) {
+define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i64_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1471,7 +1471,7 @@ end:
ret <4 x half> %phi
}
-define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i64_to_v4f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1570,7 +1570,7 @@ end:
ret <4 x half> %phi
}
-define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) {
+define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f16_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1687,7 +1687,7 @@ end:
ret i64 %phi
}
-define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f16_to_i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1810,7 +1810,7 @@ end:
ret i64 %phi
}
-define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) {
+define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i64_to_v4bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1909,7 +1909,7 @@ end:
ret <4 x bfloat> %phi
}
-define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i64_to_v4bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2014,7 +2014,7 @@ end:
ret <4 x bfloat> %phi
}
-define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
+define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2277,7 +2277,7 @@ end:
ret i64 %phi
}
-define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2560,7 +2560,7 @@ end:
ret i64 %phi
}
-define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) {
+define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) #0 {
; SI-LABEL: bitcast_i64_to_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2772,7 +2772,7 @@ end:
ret <8 x i8> %phi
}
-define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_i64_to_v8i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2952,7 +2952,7 @@ end:
ret <8 x i8> %phi
}
-define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
+define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i8_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3285,7 +3285,7 @@ end:
ret i64 %phi
}
-define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i8_to_i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3509,7 +3509,7 @@ end:
ret i64 %phi
}
-define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) {
+define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) #0 {
; SI-LABEL: bitcast_f64_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3580,7 +3580,7 @@ end:
ret <2 x i32> %phi
}
-define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f64_to_v2i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3666,7 +3666,7 @@ end:
ret <2 x i32> %phi
}
-define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) {
+define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3740,7 +3740,7 @@ end:
ret double %phi
}
-define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i32_to_f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3827,7 +3827,7 @@ end:
ret double %phi
}
-define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) {
+define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) #0 {
; SI-LABEL: bitcast_f64_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3898,7 +3898,7 @@ end:
ret <2 x float> %phi
}
-define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f64_to_v2f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3984,7 +3984,7 @@ end:
ret <2 x float> %phi
}
-define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) {
+define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4057,7 +4057,7 @@ end:
ret double %phi
}
-define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f32_to_f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4147,7 +4147,7 @@ end:
ret double %phi
}
-define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) {
+define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) #0 {
; SI-LABEL: bitcast_f64_to_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4232,7 +4232,7 @@ end:
ret <4 x i16> %phi
}
-define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f64_to_v4i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4333,7 +4333,7 @@ end:
ret <4 x i16> %phi
}
-define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) {
+define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i16_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4441,7 +4441,7 @@ end:
ret double %phi
}
-define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i16_to_f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4554,7 +4554,7 @@ end:
ret double %phi
}
-define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) {
+define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) #0 {
; SI-LABEL: bitcast_f64_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4639,7 +4639,7 @@ end:
ret <4 x half> %phi
}
-define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f64_to_v4f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4740,7 +4740,7 @@ end:
ret <4 x half> %phi
}
-define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) {
+define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f16_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4857,7 +4857,7 @@ end:
ret double %phi
}
-define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f16_to_f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4980,7 +4980,7 @@ end:
ret double %phi
}
-define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) {
+define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) #0 {
; SI-LABEL: bitcast_f64_to_v4bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5074,7 +5074,7 @@ end:
ret <4 x bfloat> %phi
}
-define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f64_to_v4bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5183,7 +5183,7 @@ end:
ret <4 x bfloat> %phi
}
-define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
+define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5446,7 +5446,7 @@ end:
ret double %phi
}
-define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5729,7 +5729,7 @@ end:
ret double %phi
}
-define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) {
+define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) #0 {
; SI-LABEL: bitcast_f64_to_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5934,7 +5934,7 @@ end:
ret <8 x i8> %phi
}
-define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_f64_to_v8i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6128,7 +6128,7 @@ end:
ret <8 x i8> %phi
}
-define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
+define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i8_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6461,7 +6461,7 @@ end:
ret double %phi
}
-define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i8_to_f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6685,7 +6685,7 @@ end:
ret double %phi
}
-define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) {
+define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6759,7 +6759,7 @@ end:
ret <2 x float> %phi
}
-define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v2f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6846,7 +6846,7 @@ end:
ret <2 x float> %phi
}
-define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) {
+define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6919,7 +6919,7 @@ end:
ret <2 x i32> %phi
}
-define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v2i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7009,7 +7009,7 @@ end:
ret <2 x i32> %phi
}
-define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) {
+define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7097,7 +7097,7 @@ end:
ret <4 x i16> %phi
}
-define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v4i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7196,7 +7196,7 @@ end:
ret <4 x i16> %phi
}
-define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) {
+define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7304,7 +7304,7 @@ end:
ret <2 x i32> %phi
}
-define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v2i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7417,7 +7417,7 @@ end:
ret <2 x i32> %phi
}
-define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) {
+define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7505,7 +7505,7 @@ end:
ret <4 x half> %phi
}
-define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v4f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7604,7 +7604,7 @@ end:
ret <4 x half> %phi
}
-define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) {
+define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7721,7 +7721,7 @@ end:
ret <2 x i32> %phi
}
-define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v2i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7844,7 +7844,7 @@ end:
ret <2 x i32> %phi
}
-define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v4bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7942,7 +7942,7 @@ end:
ret <4 x bfloat> %phi
}
-define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v4bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8047,7 +8047,7 @@ end:
ret <4 x bfloat> %phi
}
-define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
+define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8310,7 +8310,7 @@ end:
ret <2 x i32> %phi
}
-define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v2i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8593,7 +8593,7 @@ end:
ret <2 x i32> %phi
}
-define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) {
+define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8803,7 +8803,7 @@ end:
ret <8 x i8> %phi
}
-define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2i32_to_v8i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8983,7 +8983,7 @@ end:
ret <8 x i8> %phi
}
-define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
+define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9316,7 +9316,7 @@ end:
ret <2 x i32> %phi
}
-define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v2i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9540,7 +9540,7 @@ end:
ret <2 x i32> %phi
}
-define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) {
+define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9627,7 +9627,7 @@ end:
ret <4 x i16> %phi
}
-define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v4i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9732,7 +9732,7 @@ end:
ret <4 x i16> %phi
}
-define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) {
+define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9840,7 +9840,7 @@ end:
ret <2 x float> %phi
}
-define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v2f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9953,7 +9953,7 @@ end:
ret <2 x float> %phi
}
-define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) {
+define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10040,7 +10040,7 @@ end:
ret <4 x half> %phi
}
-define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v4f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10145,7 +10145,7 @@ end:
ret <4 x half> %phi
}
-define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) {
+define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10262,7 +10262,7 @@ end:
ret <2 x float> %phi
}
-define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v2f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10385,7 +10385,7 @@ end:
ret <2 x float> %phi
}
-define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v4bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10482,7 +10482,7 @@ end:
ret <4 x bfloat> %phi
}
-define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v4bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10595,7 +10595,7 @@ end:
ret <4 x bfloat> %phi
}
-define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
+define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10858,7 +10858,7 @@ end:
ret <2 x float> %phi
}
-define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v2f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11141,7 +11141,7 @@ end:
ret <2 x float> %phi
}
-define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) {
+define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11349,7 +11349,7 @@ end:
ret <8 x i8> %phi
}
-define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v2f32_to_v8i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11547,7 +11547,7 @@ end:
ret <8 x i8> %phi
}
-define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
+define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11880,7 +11880,7 @@ end:
ret <2 x float> %phi
}
-define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v2f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12104,7 +12104,7 @@ end:
ret <2 x float> %phi
}
-define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) {
+define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12216,7 +12216,7 @@ end:
ret <4 x half> %phi
}
-define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v4f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12340,7 +12340,7 @@ end:
ret <4 x half> %phi
}
-define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) {
+define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12444,7 +12444,7 @@ end:
ret <4 x i16> %phi
}
-define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v4i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12570,7 +12570,7 @@ end:
ret <4 x i16> %phi
}
-define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v4bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12680,7 +12680,7 @@ end:
ret <4 x bfloat> %phi
}
-define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v4bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12804,7 +12804,7 @@ end:
ret <4 x bfloat> %phi
}
-define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
+define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13069,7 +13069,7 @@ end:
ret <4 x i16> %phi
}
-define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v4i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13350,7 +13350,7 @@ end:
ret <4 x i16> %phi
}
-define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) {
+define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13587,7 +13587,7 @@ end:
ret <8 x i8> %phi
}
-define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4i16_to_v8i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13801,7 +13801,7 @@ end:
ret <8 x i8> %phi
}
-define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
+define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14142,7 +14142,7 @@ end:
ret <4 x i16> %phi
}
-define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v4i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14379,7 +14379,7 @@ end:
ret <4 x i16> %phi
}
-define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v4bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14498,7 +14498,7 @@ end:
ret <4 x bfloat> %phi
}
-define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v4bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14633,7 +14633,7 @@ end:
ret <4 x bfloat> %phi
}
-define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) {
+define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14907,7 +14907,7 @@ end:
ret <4 x half> %phi
}
-define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v4f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15203,7 +15203,7 @@ end:
ret <4 x half> %phi
}
-define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) {
+define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15444,7 +15444,7 @@ end:
ret <8 x i8> %phi
}
-define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4f16_to_v8i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15668,7 +15668,7 @@ end:
ret <8 x i8> %phi
}
-define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
+define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16009,7 +16009,7 @@ end:
ret <4 x half> %phi
}
-define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v4f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16246,7 +16246,7 @@ end:
ret <4 x half> %phi
}
-define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
+define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16626,7 +16626,7 @@ end:
ret <8 x i8> %phi
}
-define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v4bf16_to_v8i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17028,7 +17028,7 @@ end:
ret <8 x i8> %phi
}
-define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
+define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v4bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17370,7 +17370,7 @@ end:
ret <4 x bfloat> %phi
}
-define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 inreg %b) {
+define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v8i8_to_v4bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17604,3 +17604,5 @@ end:
%phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <4 x bfloat> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index d3fd1ab06c1c2..40efcdaa730a5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) {
+define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v22f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -164,7 +164,7 @@ end:
ret <22 x float> %phi
}
-define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v22f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -409,7 +409,7 @@ end:
ret <22 x float> %phi
}
-define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) {
+define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v22i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -556,7 +556,7 @@ end:
ret <22 x i32> %phi
}
-define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v22i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -790,7 +790,7 @@ end:
ret <22 x i32> %phi
}
-define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) {
+define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v11i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -948,7 +948,7 @@ end:
ret <11 x i64> %phi
}
-define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v11i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1193,7 +1193,7 @@ end:
ret <11 x i64> %phi
}
-define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) {
+define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v22i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1357,7 +1357,7 @@ end:
ret <22 x i32> %phi
}
-define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v22i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1608,7 +1608,7 @@ end:
ret <22 x i32> %phi
}
-define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) {
+define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v11f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1766,7 +1766,7 @@ end:
ret <11 x double> %phi
}
-define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v11f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2011,7 +2011,7 @@ end:
ret <11 x double> %phi
}
-define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) {
+define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v22i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2125,7 +2125,7 @@ end:
ret <22 x i32> %phi
}
-define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v22i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2326,7 +2326,7 @@ end:
ret <22 x i32> %phi
}
-define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) {
+define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v44i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2952,7 +2952,7 @@ end:
ret <44 x i16> %phi
}
-define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v44i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3772,7 +3772,7 @@ end:
ret <44 x i16> %phi
}
-define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
+define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v22i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4720,7 +4720,7 @@ end:
ret <22 x i32> %phi
}
-define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v22i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5535,7 +5535,7 @@ end:
ret <22 x i32> %phi
}
-define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) {
+define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v44f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6161,7 +6161,7 @@ end:
ret <44 x half> %phi
}
-define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22i32_to_v44f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6981,7 +6981,7 @@ end:
ret <44 x half> %phi
}
-define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) {
+define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v22i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8028,7 +8028,7 @@ end:
ret <22 x i32> %phi
}
-define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v22i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8870,7 +8870,7 @@ end:
ret <22 x i32> %phi
}
-define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) {
+define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v11i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9017,7 +9017,7 @@ end:
ret <11 x i64> %phi
}
-define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v11i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9251,7 +9251,7 @@ end:
ret <11 x i64> %phi
}
-define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) {
+define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v22f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9415,7 +9415,7 @@ end:
ret <22 x float> %phi
}
-define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v22f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9666,7 +9666,7 @@ end:
ret <22 x float> %phi
}
-define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) {
+define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v11f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9813,7 +9813,7 @@ end:
ret <11 x double> %phi
}
-define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v11f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10047,7 +10047,7 @@ end:
ret <11 x double> %phi
}
-define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) {
+define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v22f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10161,7 +10161,7 @@ end:
ret <22 x float> %phi
}
-define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v22f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10362,7 +10362,7 @@ end:
ret <22 x float> %phi
}
-define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) {
+define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v44i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10966,7 +10966,7 @@ end:
ret <44 x i16> %phi
}
-define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v44i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11799,7 +11799,7 @@ end:
ret <44 x i16> %phi
}
-define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
+define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v22f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12747,7 +12747,7 @@ end:
ret <22 x float> %phi
}
-define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v22f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13562,7 +13562,7 @@ end:
ret <22 x float> %phi
}
-define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) {
+define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v44f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14166,7 +14166,7 @@ end:
ret <44 x half> %phi
}
-define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v22f32_to_v44f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14999,7 +14999,7 @@ end:
ret <44 x half> %phi
}
-define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) {
+define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v22f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16046,7 +16046,7 @@ end:
ret <22 x float> %phi
}
-define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v22f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16888,7 +16888,7 @@ end:
ret <22 x float> %phi
}
-define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) {
+define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v11f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17052,7 +17052,7 @@ end:
ret <11 x double> %phi
}
-define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v11f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17303,7 +17303,7 @@ end:
ret <11 x double> %phi
}
-define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) {
+define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v11i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17417,7 +17417,7 @@ end:
ret <11 x i64> %phi
}
-define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v11i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17618,7 +17618,7 @@ end:
ret <11 x i64> %phi
}
-define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) {
+define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v44i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18256,7 +18256,7 @@ end:
ret <44 x i16> %phi
}
-define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v44i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19076,7 +19076,7 @@ end:
ret <44 x i16> %phi
}
-define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
+define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v11i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20024,7 +20024,7 @@ end:
ret <11 x i64> %phi
}
-define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v11i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20839,7 +20839,7 @@ end:
ret <11 x i64> %phi
}
-define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) {
+define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v44f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21477,7 +21477,7 @@ end:
ret <44 x half> %phi
}
-define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11i64_to_v44f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22297,7 +22297,7 @@ end:
ret <44 x half> %phi
}
-define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) {
+define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v11i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23344,7 +23344,7 @@ end:
ret <11 x i64> %phi
}
-define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v11i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24186,7 +24186,7 @@ end:
ret <11 x i64> %phi
}
-define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) {
+define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v44i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24757,7 +24757,7 @@ end:
ret <44 x i16> %phi
}
-define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v44i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25562,7 +25562,7 @@ end:
ret <44 x i16> %phi
}
-define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
+define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v11f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26510,7 +26510,7 @@ end:
ret <11 x double> %phi
}
-define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v11f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27325,7 +27325,7 @@ end:
ret <11 x double> %phi
}
-define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) {
+define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v44f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27896,7 +27896,7 @@ end:
ret <44 x half> %phi
}
-define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v11f64_to_v44f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28701,7 +28701,7 @@ end:
ret <44 x half> %phi
}
-define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) {
+define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v11f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29748,7 +29748,7 @@ end:
ret <11 x double> %phi
}
-define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v11f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30590,7 +30590,7 @@ end:
ret <11 x double> %phi
}
-define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
+define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v44f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31572,7 +31572,7 @@ end:
ret <44 x half> %phi
}
-define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i32 inreg %b) {
+define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44i16_to_v44f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31580,35 +31580,36 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v22, s30, 0
-; SI-NEXT: v_writelane_b32 v22, s31, 1
-; SI-NEXT: v_writelane_b32 v22, s34, 2
-; SI-NEXT: v_writelane_b32 v22, s35, 3
-; SI-NEXT: v_writelane_b32 v22, s36, 4
-; SI-NEXT: v_writelane_b32 v22, s37, 5
-; SI-NEXT: v_writelane_b32 v22, s38, 6
-; SI-NEXT: v_writelane_b32 v22, s39, 7
-; SI-NEXT: v_writelane_b32 v22, s48, 8
-; SI-NEXT: v_writelane_b32 v22, s49, 9
-; SI-NEXT: v_writelane_b32 v22, s50, 10
-; SI-NEXT: v_writelane_b32 v22, s51, 11
-; SI-NEXT: v_writelane_b32 v22, s52, 12
-; SI-NEXT: v_writelane_b32 v22, s53, 13
-; SI-NEXT: v_writelane_b32 v22, s54, 14
-; SI-NEXT: v_writelane_b32 v22, s55, 15
-; SI-NEXT: v_writelane_b32 v22, s64, 16
-; SI-NEXT: v_writelane_b32 v22, s65, 17
-; SI-NEXT: v_writelane_b32 v22, s66, 18
-; SI-NEXT: v_writelane_b32 v22, s67, 19
-; SI-NEXT: v_writelane_b32 v22, s68, 20
-; SI-NEXT: v_writelane_b32 v22, s69, 21
-; SI-NEXT: v_writelane_b32 v22, s70, 22
-; SI-NEXT: v_writelane_b32 v22, s71, 23
-; SI-NEXT: v_writelane_b32 v22, s80, 24
-; SI-NEXT: v_writelane_b32 v22, s81, 25
-; SI-NEXT: v_writelane_b32 v22, s82, 26
-; SI-NEXT: v_writelane_b32 v22, s83, 27
-; SI-NEXT: v_writelane_b32 v22, s84, 28
+; SI-NEXT: v_writelane_b32 v22, s34, 0
+; SI-NEXT: v_writelane_b32 v22, s35, 1
+; SI-NEXT: v_writelane_b32 v22, s36, 2
+; SI-NEXT: v_writelane_b32 v22, s37, 3
+; SI-NEXT: v_writelane_b32 v22, s38, 4
+; SI-NEXT: v_writelane_b32 v22, s39, 5
+; SI-NEXT: v_writelane_b32 v22, s48, 6
+; SI-NEXT: v_writelane_b32 v22, s49, 7
+; SI-NEXT: v_writelane_b32 v22, s50, 8
+; SI-NEXT: v_writelane_b32 v22, s51, 9
+; SI-NEXT: v_writelane_b32 v22, s52, 10
+; SI-NEXT: v_writelane_b32 v22, s53, 11
+; SI-NEXT: v_writelane_b32 v22, s54, 12
+; SI-NEXT: v_writelane_b32 v22, s55, 13
+; SI-NEXT: v_writelane_b32 v22, s64, 14
+; SI-NEXT: v_writelane_b32 v22, s65, 15
+; SI-NEXT: v_writelane_b32 v22, s66, 16
+; SI-NEXT: v_writelane_b32 v22, s67, 17
+; SI-NEXT: v_writelane_b32 v22, s68, 18
+; SI-NEXT: v_writelane_b32 v22, s69, 19
+; SI-NEXT: v_writelane_b32 v22, s70, 20
+; SI-NEXT: v_writelane_b32 v22, s71, 21
+; SI-NEXT: v_writelane_b32 v22, s80, 22
+; SI-NEXT: v_writelane_b32 v22, s81, 23
+; SI-NEXT: v_writelane_b32 v22, s82, 24
+; SI-NEXT: v_writelane_b32 v22, s83, 25
+; SI-NEXT: v_writelane_b32 v22, s84, 26
+; SI-NEXT: v_writelane_b32 v22, s85, 27
+; SI-NEXT: v_writelane_b32 v22, s30, 28
+; SI-NEXT: v_writelane_b32 v22, s31, 29
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5
@@ -31632,7 +31633,6 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s30, s17, 16
; SI-NEXT: s_lshr_b32 s51, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; SI-NEXT: v_writelane_b32 v22, s85, 29
; SI-NEXT: v_readfirstlane_b32 s82, v7
; SI-NEXT: v_readfirstlane_b32 s84, v6
; SI-NEXT: v_readfirstlane_b32 s71, v5
@@ -31941,6 +31941,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s15, s15, 0xffff
; SI-NEXT: s_lshl_b32 s26, s50, 16
; SI-NEXT: s_or_b32 s15, s15, s26
+; SI-NEXT: v_readlane_b32 s30, v22, 28
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s10
@@ -31963,36 +31964,35 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v19, s25
; SI-NEXT: v_mov_b32_e32 v20, s14
; SI-NEXT: v_mov_b32_e32 v21, s15
-; SI-NEXT: v_readlane_b32 s85, v22, 29
-; SI-NEXT: v_readlane_b32 s84, v22, 28
-; SI-NEXT: v_readlane_b32 s83, v22, 27
-; SI-NEXT: v_readlane_b32 s82, v22, 26
-; SI-NEXT: v_readlane_b32 s81, v22, 25
-; SI-NEXT: v_readlane_b32 s80, v22, 24
-; SI-NEXT: v_readlane_b32 s71, v22, 23
-; SI-NEXT: v_readlane_b32 s70, v22, 22
-; SI-NEXT: v_readlane_b32 s69, v22, 21
-; SI-NEXT: v_readlane_b32 s68, v22, 20
-; SI-NEXT: v_readlane_b32 s67, v22, 19
-; SI-NEXT: v_readlane_b32 s66, v22, 18
-; SI-NEXT: v_readlane_b32 s65, v22, 17
-; SI-NEXT: v_readlane_b32 s64, v22, 16
-; SI-NEXT: v_readlane_b32 s55, v22, 15
-; SI-NEXT: v_readlane_b32 s54, v22, 14
-; SI-NEXT: v_readlane_b32 s53, v22, 13
-; SI-NEXT: v_readlane_b32 s52, v22, 12
-; SI-NEXT: v_readlane_b32 s51, v22, 11
-; SI-NEXT: v_readlane_b32 s50, v22, 10
-; SI-NEXT: v_readlane_b32 s49, v22, 9
-; SI-NEXT: v_readlane_b32 s48, v22, 8
-; SI-NEXT: v_readlane_b32 s39, v22, 7
-; SI-NEXT: v_readlane_b32 s38, v22, 6
-; SI-NEXT: v_readlane_b32 s37, v22, 5
-; SI-NEXT: v_readlane_b32 s36, v22, 4
-; SI-NEXT: v_readlane_b32 s35, v22, 3
-; SI-NEXT: v_readlane_b32 s34, v22, 2
-; SI-NEXT: v_readlane_b32 s31, v22, 1
-; SI-NEXT: v_readlane_b32 s30, v22, 0
+; SI-NEXT: v_readlane_b32 s31, v22, 29
+; SI-NEXT: v_readlane_b32 s85, v22, 27
+; SI-NEXT: v_readlane_b32 s84, v22, 26
+; SI-NEXT: v_readlane_b32 s83, v22, 25
+; SI-NEXT: v_readlane_b32 s82, v22, 24
+; SI-NEXT: v_readlane_b32 s81, v22, 23
+; SI-NEXT: v_readlane_b32 s80, v22, 22
+; SI-NEXT: v_readlane_b32 s71, v22, 21
+; SI-NEXT: v_readlane_b32 s70, v22, 20
+; SI-NEXT: v_readlane_b32 s69, v22, 19
+; SI-NEXT: v_readlane_b32 s68, v22, 18
+; SI-NEXT: v_readlane_b32 s67, v22, 17
+; SI-NEXT: v_readlane_b32 s66, v22, 16
+; SI-NEXT: v_readlane_b32 s65, v22, 15
+; SI-NEXT: v_readlane_b32 s64, v22, 14
+; SI-NEXT: v_readlane_b32 s55, v22, 13
+; SI-NEXT: v_readlane_b32 s54, v22, 12
+; SI-NEXT: v_readlane_b32 s53, v22, 11
+; SI-NEXT: v_readlane_b32 s52, v22, 10
+; SI-NEXT: v_readlane_b32 s51, v22, 9
+; SI-NEXT: v_readlane_b32 s50, v22, 8
+; SI-NEXT: v_readlane_b32 s49, v22, 7
+; SI-NEXT: v_readlane_b32 s48, v22, 6
+; SI-NEXT: v_readlane_b32 s39, v22, 5
+; SI-NEXT: v_readlane_b32 s38, v22, 4
+; SI-NEXT: v_readlane_b32 s37, v22, 3
+; SI-NEXT: v_readlane_b32 s36, v22, 2
+; SI-NEXT: v_readlane_b32 s35, v22, 1
+; SI-NEXT: v_readlane_b32 s34, v22, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -32722,7 +32722,7 @@ end:
ret <44 x half> %phi
}
-define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) {
+define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v44i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33436,7 +33436,7 @@ end:
ret <44 x i16> %phi
}
-define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) {
+define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v44f16_to_v44i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34512,3 +34512,5 @@ end:
%phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <44 x i16> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index 194ee0705a921..2352c2a1756cf 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) {
+define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -172,7 +172,7 @@ end:
ret <24 x float> %phi
}
-define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v24f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -429,7 +429,7 @@ end:
ret <24 x float> %phi
}
-define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) {
+define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -583,7 +583,7 @@ end:
ret <24 x i32> %phi
}
-define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v24i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -828,7 +828,7 @@ end:
ret <24 x i32> %phi
}
-define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) {
+define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -994,7 +994,7 @@ end:
ret <12 x i64> %phi
}
-define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v12i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1251,7 +1251,7 @@ end:
ret <12 x i64> %phi
}
-define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) {
+define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1423,7 +1423,7 @@ end:
ret <24 x i32> %phi
}
-define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v24i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1686,7 +1686,7 @@ end:
ret <24 x i32> %phi
}
-define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) {
+define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1852,7 +1852,7 @@ end:
ret <12 x double> %phi
}
-define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v12f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2109,7 +2109,7 @@ end:
ret <12 x double> %phi
}
-define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) {
+define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2227,7 +2227,7 @@ end:
ret <24 x i32> %phi
}
-define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v24i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2436,7 +2436,7 @@ end:
ret <24 x i32> %phi
}
-define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) {
+define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v48i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3110,7 +3110,7 @@ end:
ret <48 x i16> %phi
}
-define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3118,13 +3118,13 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v24, s30, 0
+; SI-NEXT: v_writelane_b32 v24, s34, 0
; SI-NEXT: v_mov_b32_e32 v11, s16
; SI-NEXT: v_mov_b32_e32 v12, s17
; SI-NEXT: v_mov_b32_e32 v13, s18
; SI-NEXT: v_mov_b32_e32 v14, s19
; SI-NEXT: v_mov_b32_e32 v15, s20
-; SI-NEXT: v_writelane_b32 v24, s31, 1
+; SI-NEXT: v_writelane_b32 v24, s35, 1
; SI-NEXT: v_mov_b32_e32 v16, s21
; SI-NEXT: v_mov_b32_e32 v17, s22
; SI-NEXT: v_mov_b32_e32 v18, s23
@@ -3140,7 +3140,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s22, v15
; SI-NEXT: v_mov_b32_e32 v15, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: v_writelane_b32 v24, s34, 2
+; SI-NEXT: v_writelane_b32 v24, s30, 2
; SI-NEXT: v_readfirstlane_b32 s23, v16
; SI-NEXT: v_readfirstlane_b32 s20, v17
; SI-NEXT: v_readfirstlane_b32 s21, v18
@@ -3161,7 +3161,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v9
-; SI-NEXT: v_writelane_b32 v24, s35, 3
+; SI-NEXT: v_writelane_b32 v24, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s88, s5, 16
@@ -3311,6 +3311,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s40
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v24, 2
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s24
@@ -3335,10 +3336,9 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v21, s7
; SI-NEXT: v_mov_b32_e32 v22, s4
; SI-NEXT: v_mov_b32_e32 v23, s5
-; SI-NEXT: v_readlane_b32 s35, v24, 3
-; SI-NEXT: v_readlane_b32 s34, v24, 2
-; SI-NEXT: v_readlane_b32 s31, v24, 1
-; SI-NEXT: v_readlane_b32 s30, v24, 0
+; SI-NEXT: v_readlane_b32 s31, v24, 3
+; SI-NEXT: v_readlane_b32 s35, v24, 1
+; SI-NEXT: v_readlane_b32 s34, v24, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -4009,7 +4009,7 @@ end:
ret <48 x i16> %phi
}
-define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
+define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5056,7 +5056,7 @@ end:
ret <24 x i32> %phi
}
-define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v24i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5949,7 +5949,7 @@ end:
ret <24 x i32> %phi
}
-define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) {
+define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v48f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6623,7 +6623,7 @@ end:
ret <48 x half> %phi
}
-define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24i32_to_v48f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6631,13 +6631,13 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v24, s30, 0
+; SI-NEXT: v_writelane_b32 v24, s34, 0
; SI-NEXT: v_mov_b32_e32 v11, s16
; SI-NEXT: v_mov_b32_e32 v12, s17
; SI-NEXT: v_mov_b32_e32 v13, s18
; SI-NEXT: v_mov_b32_e32 v14, s19
; SI-NEXT: v_mov_b32_e32 v15, s20
-; SI-NEXT: v_writelane_b32 v24, s31, 1
+; SI-NEXT: v_writelane_b32 v24, s35, 1
; SI-NEXT: v_mov_b32_e32 v16, s21
; SI-NEXT: v_mov_b32_e32 v17, s22
; SI-NEXT: v_mov_b32_e32 v18, s23
@@ -6653,7 +6653,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s22, v15
; SI-NEXT: v_mov_b32_e32 v15, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: v_writelane_b32 v24, s34, 2
+; SI-NEXT: v_writelane_b32 v24, s30, 2
; SI-NEXT: v_readfirstlane_b32 s23, v16
; SI-NEXT: v_readfirstlane_b32 s20, v17
; SI-NEXT: v_readfirstlane_b32 s21, v18
@@ -6674,7 +6674,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v9
-; SI-NEXT: v_writelane_b32 v24, s35, 3
+; SI-NEXT: v_writelane_b32 v24, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s88, s5, 16
@@ -6824,6 +6824,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s40
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v24, 2
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s24
@@ -6848,10 +6849,9 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v21, s7
; SI-NEXT: v_mov_b32_e32 v22, s4
; SI-NEXT: v_mov_b32_e32 v23, s5
-; SI-NEXT: v_readlane_b32 s35, v24, 3
-; SI-NEXT: v_readlane_b32 s34, v24, 2
-; SI-NEXT: v_readlane_b32 s31, v24, 1
-; SI-NEXT: v_readlane_b32 s30, v24, 0
+; SI-NEXT: v_readlane_b32 s31, v24, 3
+; SI-NEXT: v_readlane_b32 s35, v24, 1
+; SI-NEXT: v_readlane_b32 s34, v24, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -7522,7 +7522,7 @@ end:
ret <48 x half> %phi
}
-define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
+define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8681,7 +8681,7 @@ end:
ret <24 x i32> %phi
}
-define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v24i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9610,7 +9610,7 @@ end:
ret <24 x i32> %phi
}
-define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) {
+define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9764,7 +9764,7 @@ end:
ret <12 x i64> %phi
}
-define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v12i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10009,7 +10009,7 @@ end:
ret <12 x i64> %phi
}
-define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) {
+define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10181,7 +10181,7 @@ end:
ret <24 x float> %phi
}
-define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v24f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10444,7 +10444,7 @@ end:
ret <24 x float> %phi
}
-define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) {
+define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10598,7 +10598,7 @@ end:
ret <12 x double> %phi
}
-define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v12f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10843,7 +10843,7 @@ end:
ret <12 x double> %phi
}
-define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) {
+define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10961,7 +10961,7 @@ end:
ret <24 x float> %phi
}
-define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v24f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11170,7 +11170,7 @@ end:
ret <24 x float> %phi
}
-define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) {
+define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v48i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11820,7 +11820,7 @@ end:
ret <48 x i16> %phi
}
-define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12742,7 +12742,7 @@ end:
ret <48 x i16> %phi
}
-define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
+define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13789,7 +13789,7 @@ end:
ret <24 x float> %phi
}
-define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v24f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14682,7 +14682,7 @@ end:
ret <24 x float> %phi
}
-define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) {
+define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v48f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15332,7 +15332,7 @@ end:
ret <48 x half> %phi
}
-define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v24f32_to_v48f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16254,7 +16254,7 @@ end:
ret <48 x half> %phi
}
-define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
+define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17413,7 +17413,7 @@ end:
ret <24 x float> %phi
}
-define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v24f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18342,7 +18342,7 @@ end:
ret <24 x float> %phi
}
-define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) {
+define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18514,7 +18514,7 @@ end:
ret <12 x double> %phi
}
-define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v12f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18777,7 +18777,7 @@ end:
ret <12 x double> %phi
}
-define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) {
+define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18895,7 +18895,7 @@ end:
ret <12 x i64> %phi
}
-define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v12i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19104,7 +19104,7 @@ end:
ret <12 x i64> %phi
}
-define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) {
+define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v48i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19790,7 +19790,7 @@ end:
ret <48 x i16> %phi
}
-define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19798,13 +19798,13 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v24, s30, 0
+; SI-NEXT: v_writelane_b32 v24, s34, 0
; SI-NEXT: v_mov_b32_e32 v11, s16
; SI-NEXT: v_mov_b32_e32 v12, s17
; SI-NEXT: v_mov_b32_e32 v13, s18
; SI-NEXT: v_mov_b32_e32 v14, s19
; SI-NEXT: v_mov_b32_e32 v15, s20
-; SI-NEXT: v_writelane_b32 v24, s31, 1
+; SI-NEXT: v_writelane_b32 v24, s35, 1
; SI-NEXT: v_mov_b32_e32 v16, s21
; SI-NEXT: v_mov_b32_e32 v17, s22
; SI-NEXT: v_mov_b32_e32 v18, s23
@@ -19820,7 +19820,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s22, v15
; SI-NEXT: v_mov_b32_e32 v15, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: v_writelane_b32 v24, s34, 2
+; SI-NEXT: v_writelane_b32 v24, s30, 2
; SI-NEXT: v_readfirstlane_b32 s23, v16
; SI-NEXT: v_readfirstlane_b32 s20, v17
; SI-NEXT: v_readfirstlane_b32 s21, v18
@@ -19841,7 +19841,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v9
-; SI-NEXT: v_writelane_b32 v24, s35, 3
+; SI-NEXT: v_writelane_b32 v24, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s88, s5, 16
@@ -19991,6 +19991,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s40
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v24, 2
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s24
@@ -20015,10 +20016,9 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v21, s7
; SI-NEXT: v_mov_b32_e32 v22, s4
; SI-NEXT: v_mov_b32_e32 v23, s5
-; SI-NEXT: v_readlane_b32 s35, v24, 3
-; SI-NEXT: v_readlane_b32 s34, v24, 2
-; SI-NEXT: v_readlane_b32 s31, v24, 1
-; SI-NEXT: v_readlane_b32 s30, v24, 0
+; SI-NEXT: v_readlane_b32 s31, v24, 3
+; SI-NEXT: v_readlane_b32 s35, v24, 1
+; SI-NEXT: v_readlane_b32 s34, v24, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -20689,7 +20689,7 @@ end:
ret <48 x i16> %phi
}
-define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
+define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21736,7 +21736,7 @@ end:
ret <12 x i64> %phi
}
-define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v12i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22629,7 +22629,7 @@ end:
ret <12 x i64> %phi
}
-define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) {
+define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v48f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23315,7 +23315,7 @@ end:
ret <48 x half> %phi
}
-define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i64_to_v48f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23323,13 +23323,13 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v24, s30, 0
+; SI-NEXT: v_writelane_b32 v24, s34, 0
; SI-NEXT: v_mov_b32_e32 v11, s16
; SI-NEXT: v_mov_b32_e32 v12, s17
; SI-NEXT: v_mov_b32_e32 v13, s18
; SI-NEXT: v_mov_b32_e32 v14, s19
; SI-NEXT: v_mov_b32_e32 v15, s20
-; SI-NEXT: v_writelane_b32 v24, s31, 1
+; SI-NEXT: v_writelane_b32 v24, s35, 1
; SI-NEXT: v_mov_b32_e32 v16, s21
; SI-NEXT: v_mov_b32_e32 v17, s22
; SI-NEXT: v_mov_b32_e32 v18, s23
@@ -23345,7 +23345,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s22, v15
; SI-NEXT: v_mov_b32_e32 v15, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: v_writelane_b32 v24, s34, 2
+; SI-NEXT: v_writelane_b32 v24, s30, 2
; SI-NEXT: v_readfirstlane_b32 s23, v16
; SI-NEXT: v_readfirstlane_b32 s20, v17
; SI-NEXT: v_readfirstlane_b32 s21, v18
@@ -23366,7 +23366,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v8
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v9
-; SI-NEXT: v_writelane_b32 v24, s35, 3
+; SI-NEXT: v_writelane_b32 v24, s31, 3
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s88, s5, 16
@@ -23516,6 +23516,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s40
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v24, 2
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s24
@@ -23540,10 +23541,9 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v21, s7
; SI-NEXT: v_mov_b32_e32 v22, s4
; SI-NEXT: v_mov_b32_e32 v23, s5
-; SI-NEXT: v_readlane_b32 s35, v24, 3
-; SI-NEXT: v_readlane_b32 s34, v24, 2
-; SI-NEXT: v_readlane_b32 s31, v24, 1
-; SI-NEXT: v_readlane_b32 s30, v24, 0
+; SI-NEXT: v_readlane_b32 s31, v24, 3
+; SI-NEXT: v_readlane_b32 s35, v24, 1
+; SI-NEXT: v_readlane_b32 s34, v24, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -24214,7 +24214,7 @@ end:
ret <48 x half> %phi
}
-define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
+define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25373,7 +25373,7 @@ end:
ret <12 x i64> %phi
}
-define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v12i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26302,7 +26302,7 @@ end:
ret <12 x i64> %phi
}
-define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) {
+define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v48i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26916,7 +26916,7 @@ end:
ret <48 x i16> %phi
}
-define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27802,7 +27802,7 @@ end:
ret <48 x i16> %phi
}
-define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
+define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28849,7 +28849,7 @@ end:
ret <12 x double> %phi
}
-define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v12f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29742,7 +29742,7 @@ end:
ret <12 x double> %phi
}
-define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) {
+define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v48f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30356,7 +30356,7 @@ end:
ret <48 x half> %phi
}
-define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12f64_to_v48f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31242,7 +31242,7 @@ end:
ret <48 x half> %phi
}
-define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
+define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32401,7 +32401,7 @@ end:
ret <12 x double> %phi
}
-define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v12f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33330,7 +33330,7 @@ end:
ret <12 x double> %phi
}
-define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
+define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v48f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34425,7 +34425,7 @@ end:
ret <48 x half> %phi
}
-define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i32 inreg %b) {
+define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48i16_to_v48f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34433,41 +34433,40 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v24, s30, 0
-; SI-NEXT: v_writelane_b32 v24, s31, 1
-; SI-NEXT: v_writelane_b32 v24, s34, 2
-; SI-NEXT: v_writelane_b32 v24, s35, 3
-; SI-NEXT: v_writelane_b32 v24, s36, 4
-; SI-NEXT: v_writelane_b32 v24, s37, 5
-; SI-NEXT: v_writelane_b32 v24, s38, 6
-; SI-NEXT: v_writelane_b32 v24, s39, 7
-; SI-NEXT: v_writelane_b32 v24, s48, 8
-; SI-NEXT: v_writelane_b32 v24, s49, 9
-; SI-NEXT: v_writelane_b32 v24, s50, 10
-; SI-NEXT: v_writelane_b32 v24, s51, 11
-; SI-NEXT: v_writelane_b32 v24, s52, 12
-; SI-NEXT: v_writelane_b32 v24, s53, 13
-; SI-NEXT: v_writelane_b32 v24, s54, 14
-; SI-NEXT: v_writelane_b32 v24, s55, 15
-; SI-NEXT: v_writelane_b32 v24, s64, 16
-; SI-NEXT: v_writelane_b32 v24, s65, 17
-; SI-NEXT: v_writelane_b32 v24, s66, 18
-; SI-NEXT: v_writelane_b32 v24, s67, 19
-; SI-NEXT: v_writelane_b32 v24, s68, 20
-; SI-NEXT: v_writelane_b32 v24, s69, 21
-; SI-NEXT: v_writelane_b32 v24, s70, 22
-; SI-NEXT: v_writelane_b32 v24, s71, 23
-; SI-NEXT: v_writelane_b32 v24, s80, 24
-; SI-NEXT: v_writelane_b32 v24, s81, 25
-; SI-NEXT: v_writelane_b32 v24, s82, 26
-; SI-NEXT: v_writelane_b32 v24, s83, 27
-; SI-NEXT: v_writelane_b32 v24, s84, 28
-; SI-NEXT: v_writelane_b32 v24, s85, 29
-; SI-NEXT: v_writelane_b32 v24, s86, 30
-; SI-NEXT: v_writelane_b32 v24, s87, 31
-; SI-NEXT: v_writelane_b32 v24, s96, 32
-; SI-NEXT: v_writelane_b32 v24, s97, 33
-; SI-NEXT: v_writelane_b32 v24, s98, 34
+; SI-NEXT: v_writelane_b32 v24, s34, 0
+; SI-NEXT: v_writelane_b32 v24, s35, 1
+; SI-NEXT: v_writelane_b32 v24, s36, 2
+; SI-NEXT: v_writelane_b32 v24, s37, 3
+; SI-NEXT: v_writelane_b32 v24, s38, 4
+; SI-NEXT: v_writelane_b32 v24, s39, 5
+; SI-NEXT: v_writelane_b32 v24, s48, 6
+; SI-NEXT: v_writelane_b32 v24, s49, 7
+; SI-NEXT: v_writelane_b32 v24, s50, 8
+; SI-NEXT: v_writelane_b32 v24, s51, 9
+; SI-NEXT: v_writelane_b32 v24, s52, 10
+; SI-NEXT: v_writelane_b32 v24, s53, 11
+; SI-NEXT: v_writelane_b32 v24, s54, 12
+; SI-NEXT: v_writelane_b32 v24, s55, 13
+; SI-NEXT: v_writelane_b32 v24, s64, 14
+; SI-NEXT: v_writelane_b32 v24, s65, 15
+; SI-NEXT: v_writelane_b32 v24, s66, 16
+; SI-NEXT: v_writelane_b32 v24, s67, 17
+; SI-NEXT: v_writelane_b32 v24, s68, 18
+; SI-NEXT: v_writelane_b32 v24, s69, 19
+; SI-NEXT: v_writelane_b32 v24, s70, 20
+; SI-NEXT: v_writelane_b32 v24, s71, 21
+; SI-NEXT: v_writelane_b32 v24, s80, 22
+; SI-NEXT: v_writelane_b32 v24, s81, 23
+; SI-NEXT: v_writelane_b32 v24, s82, 24
+; SI-NEXT: v_writelane_b32 v24, s83, 25
+; SI-NEXT: v_writelane_b32 v24, s84, 26
+; SI-NEXT: v_writelane_b32 v24, s85, 27
+; SI-NEXT: v_writelane_b32 v24, s86, 28
+; SI-NEXT: v_writelane_b32 v24, s87, 29
+; SI-NEXT: v_writelane_b32 v24, s96, 30
+; SI-NEXT: v_writelane_b32 v24, s97, 31
+; SI-NEXT: v_writelane_b32 v24, s98, 32
+; SI-NEXT: v_writelane_b32 v24, s99, 33
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7
@@ -34494,7 +34493,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s36, s17, 16
; SI-NEXT: s_lshr_b32 s61, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: v_writelane_b32 v24, s99, 35
+; SI-NEXT: v_writelane_b32 v24, s30, 34
; SI-NEXT: v_readfirstlane_b32 s64, v8
; SI-NEXT: v_readfirstlane_b32 s87, v7
; SI-NEXT: v_readfirstlane_b32 s97, v6
@@ -34515,6 +34514,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s96, v18
; SI-NEXT: v_readfirstlane_b32 s51, v19
; SI-NEXT: v_readfirstlane_b32 s84, v9
+; SI-NEXT: v_writelane_b32 v24, s31, 35
; SI-NEXT: s_cbranch_scc0 .LBB57_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s5, s17, 0xffff
@@ -34836,6 +34836,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s27, s47, 0xffff
; SI-NEXT: s_lshl_b32 s28, s55, 16
; SI-NEXT: s_or_b32 s27, s27, s28
+; SI-NEXT: v_readlane_b32 s30, v24, 34
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
@@ -34860,42 +34861,41 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v21, s25
; SI-NEXT: v_mov_b32_e32 v22, s26
; SI-NEXT: v_mov_b32_e32 v23, s27
-; SI-NEXT: v_readlane_b32 s99, v24, 35
-; SI-NEXT: v_readlane_b32 s98, v24, 34
-; SI-NEXT: v_readlane_b32 s97, v24, 33
-; SI-NEXT: v_readlane_b32 s96, v24, 32
-; SI-NEXT: v_readlane_b32 s87, v24, 31
-; SI-NEXT: v_readlane_b32 s86, v24, 30
-; SI-NEXT: v_readlane_b32 s85, v24, 29
-; SI-NEXT: v_readlane_b32 s84, v24, 28
-; SI-NEXT: v_readlane_b32 s83, v24, 27
-; SI-NEXT: v_readlane_b32 s82, v24, 26
-; SI-NEXT: v_readlane_b32 s81, v24, 25
-; SI-NEXT: v_readlane_b32 s80, v24, 24
-; SI-NEXT: v_readlane_b32 s71, v24, 23
-; SI-NEXT: v_readlane_b32 s70, v24, 22
-; SI-NEXT: v_readlane_b32 s69, v24, 21
-; SI-NEXT: v_readlane_b32 s68, v24, 20
-; SI-NEXT: v_readlane_b32 s67, v24, 19
-; SI-NEXT: v_readlane_b32 s66, v24, 18
-; SI-NEXT: v_readlane_b32 s65, v24, 17
-; SI-NEXT: v_readlane_b32 s64, v24, 16
-; SI-NEXT: v_readlane_b32 s55, v24, 15
-; SI-NEXT: v_readlane_b32 s54, v24, 14
-; SI-NEXT: v_readlane_b32 s53, v24, 13
-; SI-NEXT: v_readlane_b32 s52, v24, 12
-; SI-NEXT: v_readlane_b32 s51, v24, 11
-; SI-NEXT: v_readlane_b32 s50, v24, 10
-; SI-NEXT: v_readlane_b32 s49, v24, 9
-; SI-NEXT: v_readlane_b32 s48, v24, 8
-; SI-NEXT: v_readlane_b32 s39, v24, 7
-; SI-NEXT: v_readlane_b32 s38, v24, 6
-; SI-NEXT: v_readlane_b32 s37, v24, 5
-; SI-NEXT: v_readlane_b32 s36, v24, 4
-; SI-NEXT: v_readlane_b32 s35, v24, 3
-; SI-NEXT: v_readlane_b32 s34, v24, 2
-; SI-NEXT: v_readlane_b32 s31, v24, 1
-; SI-NEXT: v_readlane_b32 s30, v24, 0
+; SI-NEXT: v_readlane_b32 s31, v24, 35
+; SI-NEXT: v_readlane_b32 s99, v24, 33
+; SI-NEXT: v_readlane_b32 s98, v24, 32
+; SI-NEXT: v_readlane_b32 s97, v24, 31
+; SI-NEXT: v_readlane_b32 s96, v24, 30
+; SI-NEXT: v_readlane_b32 s87, v24, 29
+; SI-NEXT: v_readlane_b32 s86, v24, 28
+; SI-NEXT: v_readlane_b32 s85, v24, 27
+; SI-NEXT: v_readlane_b32 s84, v24, 26
+; SI-NEXT: v_readlane_b32 s83, v24, 25
+; SI-NEXT: v_readlane_b32 s82, v24, 24
+; SI-NEXT: v_readlane_b32 s81, v24, 23
+; SI-NEXT: v_readlane_b32 s80, v24, 22
+; SI-NEXT: v_readlane_b32 s71, v24, 21
+; SI-NEXT: v_readlane_b32 s70, v24, 20
+; SI-NEXT: v_readlane_b32 s69, v24, 19
+; SI-NEXT: v_readlane_b32 s68, v24, 18
+; SI-NEXT: v_readlane_b32 s67, v24, 17
+; SI-NEXT: v_readlane_b32 s66, v24, 16
+; SI-NEXT: v_readlane_b32 s65, v24, 15
+; SI-NEXT: v_readlane_b32 s64, v24, 14
+; SI-NEXT: v_readlane_b32 s55, v24, 13
+; SI-NEXT: v_readlane_b32 s54, v24, 12
+; SI-NEXT: v_readlane_b32 s53, v24, 11
+; SI-NEXT: v_readlane_b32 s52, v24, 10
+; SI-NEXT: v_readlane_b32 s51, v24, 9
+; SI-NEXT: v_readlane_b32 s50, v24, 8
+; SI-NEXT: v_readlane_b32 s49, v24, 7
+; SI-NEXT: v_readlane_b32 s48, v24, 6
+; SI-NEXT: v_readlane_b32 s39, v24, 5
+; SI-NEXT: v_readlane_b32 s38, v24, 4
+; SI-NEXT: v_readlane_b32 s37, v24, 3
+; SI-NEXT: v_readlane_b32 s36, v24, 2
+; SI-NEXT: v_readlane_b32 s35, v24, 1
+; SI-NEXT: v_readlane_b32 s34, v24, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -35682,7 +35682,7 @@ end:
ret <48 x half> %phi
}
-define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) {
+define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v48i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36458,7 +36458,7 @@ end:
ret <48 x i16> %phi
}
-define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i32 inreg %b) {
+define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37628,3 +37628,5 @@ end:
%phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <48 x i16> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 60ce818302ce7..1993e506d5dff 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) {
+define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v26f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -180,7 +180,7 @@ end:
ret <26 x float> %phi
}
-define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v26f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -452,7 +452,7 @@ end:
ret <26 x float> %phi
}
-define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) {
+define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v26i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -613,7 +613,7 @@ end:
ret <26 x i32> %phi
}
-define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v26i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -872,7 +872,7 @@ end:
ret <26 x i32> %phi
}
-define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) {
+define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v13i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1046,7 +1046,7 @@ end:
ret <13 x i64> %phi
}
-define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v13i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1318,7 +1318,7 @@ end:
ret <13 x i64> %phi
}
-define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) {
+define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v26i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1499,7 +1499,7 @@ end:
ret <26 x i32> %phi
}
-define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v26i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1778,7 +1778,7 @@ end:
ret <26 x i32> %phi
}
-define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) {
+define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v13f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1952,7 +1952,7 @@ end:
ret <13 x double> %phi
}
-define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v13f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2224,7 +2224,7 @@ end:
ret <13 x double> %phi
}
-define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) {
+define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v26i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2346,7 +2346,7 @@ end:
ret <26 x i32> %phi
}
-define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v26i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2566,7 +2566,7 @@ end:
ret <26 x i32> %phi
}
-define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) {
+define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3324,7 +3324,7 @@ end:
ret <52 x i16> %phi
}
-define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v52i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3332,12 +3332,12 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v26, s30, 0
-; SI-NEXT: v_writelane_b32 v26, s31, 1
-; SI-NEXT: v_writelane_b32 v26, s34, 2
-; SI-NEXT: v_writelane_b32 v26, s35, 3
-; SI-NEXT: v_writelane_b32 v26, s36, 4
-; SI-NEXT: v_writelane_b32 v26, s37, 5
+; SI-NEXT: v_writelane_b32 v26, s34, 0
+; SI-NEXT: v_writelane_b32 v26, s35, 1
+; SI-NEXT: v_writelane_b32 v26, s36, 2
+; SI-NEXT: v_writelane_b32 v26, s37, 3
+; SI-NEXT: v_writelane_b32 v26, s38, 4
+; SI-NEXT: v_writelane_b32 v26, s39, 5
; SI-NEXT: v_mov_b32_e32 v13, s16
; SI-NEXT: v_mov_b32_e32 v14, s17
; SI-NEXT: v_mov_b32_e32 v15, s18
@@ -3345,7 +3345,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v17, s20
; SI-NEXT: v_mov_b32_e32 v18, s21
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v26, s38, 6
+; SI-NEXT: v_writelane_b32 v26, s48, 6
; SI-NEXT: v_readfirstlane_b32 s42, v13
; SI-NEXT: v_mov_b32_e32 v13, s23
; SI-NEXT: v_readfirstlane_b32 s43, v14
@@ -3361,7 +3361,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s22, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; SI-NEXT: v_writelane_b32 v26, s39, 7
+; SI-NEXT: v_writelane_b32 v26, s30, 7
; SI-NEXT: v_readfirstlane_b32 s23, v13
; SI-NEXT: v_readfirstlane_b32 s20, v14
; SI-NEXT: v_readfirstlane_b32 s21, v15
@@ -3382,7 +3382,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v11
-; SI-NEXT: v_writelane_b32 v26, s48, 8
+; SI-NEXT: v_writelane_b32 v26, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s92, s5, 16
@@ -3544,6 +3544,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s42
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v26, 7
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s40
@@ -3570,15 +3571,14 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v23, s7
; SI-NEXT: v_mov_b32_e32 v24, s4
; SI-NEXT: v_mov_b32_e32 v25, s5
-; SI-NEXT: v_readlane_b32 s48, v26, 8
-; SI-NEXT: v_readlane_b32 s39, v26, 7
-; SI-NEXT: v_readlane_b32 s38, v26, 6
-; SI-NEXT: v_readlane_b32 s37, v26, 5
-; SI-NEXT: v_readlane_b32 s36, v26, 4
-; SI-NEXT: v_readlane_b32 s35, v26, 3
-; SI-NEXT: v_readlane_b32 s34, v26, 2
-; SI-NEXT: v_readlane_b32 s31, v26, 1
-; SI-NEXT: v_readlane_b32 s30, v26, 0
+; SI-NEXT: v_readlane_b32 s31, v26, 8
+; SI-NEXT: v_readlane_b32 s48, v26, 6
+; SI-NEXT: v_readlane_b32 s39, v26, 5
+; SI-NEXT: v_readlane_b32 s38, v26, 4
+; SI-NEXT: v_readlane_b32 s37, v26, 3
+; SI-NEXT: v_readlane_b32 s36, v26, 2
+; SI-NEXT: v_readlane_b32 s35, v26, 1
+; SI-NEXT: v_readlane_b32 s34, v26, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -4296,7 +4296,7 @@ end:
ret <52 x i16> %phi
}
-define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
+define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v26i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5445,7 +5445,7 @@ end:
ret <26 x i32> %phi
}
-define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v26i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6413,7 +6413,7 @@ end:
ret <26 x i32> %phi
}
-define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) {
+define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7171,7 +7171,7 @@ end:
ret <52 x half> %phi
}
-define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26i32_to_v52f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7179,12 +7179,12 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v26, s30, 0
-; SI-NEXT: v_writelane_b32 v26, s31, 1
-; SI-NEXT: v_writelane_b32 v26, s34, 2
-; SI-NEXT: v_writelane_b32 v26, s35, 3
-; SI-NEXT: v_writelane_b32 v26, s36, 4
-; SI-NEXT: v_writelane_b32 v26, s37, 5
+; SI-NEXT: v_writelane_b32 v26, s34, 0
+; SI-NEXT: v_writelane_b32 v26, s35, 1
+; SI-NEXT: v_writelane_b32 v26, s36, 2
+; SI-NEXT: v_writelane_b32 v26, s37, 3
+; SI-NEXT: v_writelane_b32 v26, s38, 4
+; SI-NEXT: v_writelane_b32 v26, s39, 5
; SI-NEXT: v_mov_b32_e32 v13, s16
; SI-NEXT: v_mov_b32_e32 v14, s17
; SI-NEXT: v_mov_b32_e32 v15, s18
@@ -7192,7 +7192,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v17, s20
; SI-NEXT: v_mov_b32_e32 v18, s21
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v26, s38, 6
+; SI-NEXT: v_writelane_b32 v26, s48, 6
; SI-NEXT: v_readfirstlane_b32 s42, v13
; SI-NEXT: v_mov_b32_e32 v13, s23
; SI-NEXT: v_readfirstlane_b32 s43, v14
@@ -7208,7 +7208,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s22, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; SI-NEXT: v_writelane_b32 v26, s39, 7
+; SI-NEXT: v_writelane_b32 v26, s30, 7
; SI-NEXT: v_readfirstlane_b32 s23, v13
; SI-NEXT: v_readfirstlane_b32 s20, v14
; SI-NEXT: v_readfirstlane_b32 s21, v15
@@ -7229,7 +7229,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v11
-; SI-NEXT: v_writelane_b32 v26, s48, 8
+; SI-NEXT: v_writelane_b32 v26, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s92, s5, 16
@@ -7391,6 +7391,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s42
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v26, 7
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s40
@@ -7417,15 +7418,14 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v23, s7
; SI-NEXT: v_mov_b32_e32 v24, s4
; SI-NEXT: v_mov_b32_e32 v25, s5
-; SI-NEXT: v_readlane_b32 s48, v26, 8
-; SI-NEXT: v_readlane_b32 s39, v26, 7
-; SI-NEXT: v_readlane_b32 s38, v26, 6
-; SI-NEXT: v_readlane_b32 s37, v26, 5
-; SI-NEXT: v_readlane_b32 s36, v26, 4
-; SI-NEXT: v_readlane_b32 s35, v26, 3
-; SI-NEXT: v_readlane_b32 s34, v26, 2
-; SI-NEXT: v_readlane_b32 s31, v26, 1
-; SI-NEXT: v_readlane_b32 s30, v26, 0
+; SI-NEXT: v_readlane_b32 s31, v26, 8
+; SI-NEXT: v_readlane_b32 s48, v26, 6
+; SI-NEXT: v_readlane_b32 s39, v26, 5
+; SI-NEXT: v_readlane_b32 s38, v26, 4
+; SI-NEXT: v_readlane_b32 s37, v26, 3
+; SI-NEXT: v_readlane_b32 s36, v26, 2
+; SI-NEXT: v_readlane_b32 s35, v26, 1
+; SI-NEXT: v_readlane_b32 s34, v26, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -8143,7 +8143,7 @@ end:
ret <52 x half> %phi
}
-define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
+define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v26i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9413,7 +9413,7 @@ end:
ret <26 x i32> %phi
}
-define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v26i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10426,7 +10426,7 @@ end:
ret <26 x i32> %phi
}
-define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) {
+define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v13i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10587,7 +10587,7 @@ end:
ret <13 x i64> %phi
}
-define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v13i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10846,7 +10846,7 @@ end:
ret <13 x i64> %phi
}
-define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) {
+define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v26f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11027,7 +11027,7 @@ end:
ret <26 x float> %phi
}
-define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v26f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11306,7 +11306,7 @@ end:
ret <26 x float> %phi
}
-define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) {
+define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v13f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11467,7 +11467,7 @@ end:
ret <13 x double> %phi
}
-define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v13f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11726,7 +11726,7 @@ end:
ret <13 x double> %phi
}
-define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) {
+define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v26f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11848,7 +11848,7 @@ end:
ret <26 x float> %phi
}
-define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v26f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12068,7 +12068,7 @@ end:
ret <26 x float> %phi
}
-define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) {
+define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12800,7 +12800,7 @@ end:
ret <52 x i16> %phi
}
-define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v52i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13824,7 +13824,7 @@ end:
ret <52 x i16> %phi
}
-define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
+define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v26f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14973,7 +14973,7 @@ end:
ret <26 x float> %phi
}
-define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v26f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15941,7 +15941,7 @@ end:
ret <26 x float> %phi
}
-define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) {
+define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16673,7 +16673,7 @@ end:
ret <52 x half> %phi
}
-define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v26f32_to_v52f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17697,7 +17697,7 @@ end:
ret <52 x half> %phi
}
-define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
+define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v26f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18967,7 +18967,7 @@ end:
ret <26 x float> %phi
}
-define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v26f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19980,7 +19980,7 @@ end:
ret <26 x float> %phi
}
-define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) {
+define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v13f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20161,7 +20161,7 @@ end:
ret <13 x double> %phi
}
-define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v13f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20440,7 +20440,7 @@ end:
ret <13 x double> %phi
}
-define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) {
+define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v13i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20562,7 +20562,7 @@ end:
ret <13 x i64> %phi
}
-define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v13i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20782,7 +20782,7 @@ end:
ret <13 x i64> %phi
}
-define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) {
+define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21554,7 +21554,7 @@ end:
ret <52 x i16> %phi
}
-define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v52i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21562,12 +21562,12 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v26, s30, 0
-; SI-NEXT: v_writelane_b32 v26, s31, 1
-; SI-NEXT: v_writelane_b32 v26, s34, 2
-; SI-NEXT: v_writelane_b32 v26, s35, 3
-; SI-NEXT: v_writelane_b32 v26, s36, 4
-; SI-NEXT: v_writelane_b32 v26, s37, 5
+; SI-NEXT: v_writelane_b32 v26, s34, 0
+; SI-NEXT: v_writelane_b32 v26, s35, 1
+; SI-NEXT: v_writelane_b32 v26, s36, 2
+; SI-NEXT: v_writelane_b32 v26, s37, 3
+; SI-NEXT: v_writelane_b32 v26, s38, 4
+; SI-NEXT: v_writelane_b32 v26, s39, 5
; SI-NEXT: v_mov_b32_e32 v13, s16
; SI-NEXT: v_mov_b32_e32 v14, s17
; SI-NEXT: v_mov_b32_e32 v15, s18
@@ -21575,7 +21575,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v17, s20
; SI-NEXT: v_mov_b32_e32 v18, s21
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v26, s38, 6
+; SI-NEXT: v_writelane_b32 v26, s48, 6
; SI-NEXT: v_readfirstlane_b32 s42, v13
; SI-NEXT: v_mov_b32_e32 v13, s23
; SI-NEXT: v_readfirstlane_b32 s43, v14
@@ -21591,7 +21591,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s22, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; SI-NEXT: v_writelane_b32 v26, s39, 7
+; SI-NEXT: v_writelane_b32 v26, s30, 7
; SI-NEXT: v_readfirstlane_b32 s23, v13
; SI-NEXT: v_readfirstlane_b32 s20, v14
; SI-NEXT: v_readfirstlane_b32 s21, v15
@@ -21612,7 +21612,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v11
-; SI-NEXT: v_writelane_b32 v26, s48, 8
+; SI-NEXT: v_writelane_b32 v26, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s92, s5, 16
@@ -21774,6 +21774,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s42
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v26, 7
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s40
@@ -21800,15 +21801,14 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v23, s7
; SI-NEXT: v_mov_b32_e32 v24, s4
; SI-NEXT: v_mov_b32_e32 v25, s5
-; SI-NEXT: v_readlane_b32 s48, v26, 8
-; SI-NEXT: v_readlane_b32 s39, v26, 7
-; SI-NEXT: v_readlane_b32 s38, v26, 6
-; SI-NEXT: v_readlane_b32 s37, v26, 5
-; SI-NEXT: v_readlane_b32 s36, v26, 4
-; SI-NEXT: v_readlane_b32 s35, v26, 3
-; SI-NEXT: v_readlane_b32 s34, v26, 2
-; SI-NEXT: v_readlane_b32 s31, v26, 1
-; SI-NEXT: v_readlane_b32 s30, v26, 0
+; SI-NEXT: v_readlane_b32 s31, v26, 8
+; SI-NEXT: v_readlane_b32 s48, v26, 6
+; SI-NEXT: v_readlane_b32 s39, v26, 5
+; SI-NEXT: v_readlane_b32 s38, v26, 4
+; SI-NEXT: v_readlane_b32 s37, v26, 3
+; SI-NEXT: v_readlane_b32 s36, v26, 2
+; SI-NEXT: v_readlane_b32 s35, v26, 1
+; SI-NEXT: v_readlane_b32 s34, v26, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -22526,7 +22526,7 @@ end:
ret <52 x i16> %phi
}
-define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
+define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v13i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23675,7 +23675,7 @@ end:
ret <13 x i64> %phi
}
-define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v13i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24643,7 +24643,7 @@ end:
ret <13 x i64> %phi
}
-define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) {
+define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25415,7 +25415,7 @@ end:
ret <52 x half> %phi
}
-define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13i64_to_v52f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25423,12 +25423,12 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v26, s30, 0
-; SI-NEXT: v_writelane_b32 v26, s31, 1
-; SI-NEXT: v_writelane_b32 v26, s34, 2
-; SI-NEXT: v_writelane_b32 v26, s35, 3
-; SI-NEXT: v_writelane_b32 v26, s36, 4
-; SI-NEXT: v_writelane_b32 v26, s37, 5
+; SI-NEXT: v_writelane_b32 v26, s34, 0
+; SI-NEXT: v_writelane_b32 v26, s35, 1
+; SI-NEXT: v_writelane_b32 v26, s36, 2
+; SI-NEXT: v_writelane_b32 v26, s37, 3
+; SI-NEXT: v_writelane_b32 v26, s38, 4
+; SI-NEXT: v_writelane_b32 v26, s39, 5
; SI-NEXT: v_mov_b32_e32 v13, s16
; SI-NEXT: v_mov_b32_e32 v14, s17
; SI-NEXT: v_mov_b32_e32 v15, s18
@@ -25436,7 +25436,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v17, s20
; SI-NEXT: v_mov_b32_e32 v18, s21
; SI-NEXT: v_mov_b32_e32 v19, s22
-; SI-NEXT: v_writelane_b32 v26, s38, 6
+; SI-NEXT: v_writelane_b32 v26, s48, 6
; SI-NEXT: v_readfirstlane_b32 s42, v13
; SI-NEXT: v_mov_b32_e32 v13, s23
; SI-NEXT: v_readfirstlane_b32 s43, v14
@@ -25452,7 +25452,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s22, v19
; SI-NEXT: v_mov_b32_e32 v19, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; SI-NEXT: v_writelane_b32 v26, s39, 7
+; SI-NEXT: v_writelane_b32 v26, s30, 7
; SI-NEXT: v_readfirstlane_b32 s23, v13
; SI-NEXT: v_readfirstlane_b32 s20, v14
; SI-NEXT: v_readfirstlane_b32 s21, v15
@@ -25473,7 +25473,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v11
-; SI-NEXT: v_writelane_b32 v26, s48, 8
+; SI-NEXT: v_writelane_b32 v26, s31, 8
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s92, s5, 16
@@ -25635,6 +25635,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s42
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v26, 7
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s40
@@ -25661,15 +25662,14 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v23, s7
; SI-NEXT: v_mov_b32_e32 v24, s4
; SI-NEXT: v_mov_b32_e32 v25, s5
-; SI-NEXT: v_readlane_b32 s48, v26, 8
-; SI-NEXT: v_readlane_b32 s39, v26, 7
-; SI-NEXT: v_readlane_b32 s38, v26, 6
-; SI-NEXT: v_readlane_b32 s37, v26, 5
-; SI-NEXT: v_readlane_b32 s36, v26, 4
-; SI-NEXT: v_readlane_b32 s35, v26, 3
-; SI-NEXT: v_readlane_b32 s34, v26, 2
-; SI-NEXT: v_readlane_b32 s31, v26, 1
-; SI-NEXT: v_readlane_b32 s30, v26, 0
+; SI-NEXT: v_readlane_b32 s31, v26, 8
+; SI-NEXT: v_readlane_b32 s48, v26, 6
+; SI-NEXT: v_readlane_b32 s39, v26, 5
+; SI-NEXT: v_readlane_b32 s38, v26, 4
+; SI-NEXT: v_readlane_b32 s37, v26, 3
+; SI-NEXT: v_readlane_b32 s36, v26, 2
+; SI-NEXT: v_readlane_b32 s35, v26, 1
+; SI-NEXT: v_readlane_b32 s34, v26, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -26387,7 +26387,7 @@ end:
ret <52 x half> %phi
}
-define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
+define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v13i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27657,7 +27657,7 @@ end:
ret <13 x i64> %phi
}
-define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v13i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28670,7 +28670,7 @@ end:
ret <13 x i64> %phi
}
-define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) {
+define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29363,7 +29363,7 @@ end:
ret <52 x i16> %phi
}
-define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v52i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30346,7 +30346,7 @@ end:
ret <52 x i16> %phi
}
-define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
+define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v13f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31495,7 +31495,7 @@ end:
ret <13 x double> %phi
}
-define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v13f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32463,7 +32463,7 @@ end:
ret <13 x double> %phi
}
-define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
+define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33156,7 +33156,7 @@ end:
ret <52 x half> %phi
}
-define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v13f64_to_v52f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34139,7 +34139,7 @@ end:
ret <52 x half> %phi
}
-define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
+define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v13f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35409,7 +35409,7 @@ end:
ret <13 x double> %phi
}
-define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v13f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36422,7 +36422,7 @@ end:
ret <13 x double> %phi
}
-define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
+define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v52f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37648,7 +37648,7 @@ end:
ret <52 x half> %phi
}
-define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i32 inreg %b) {
+define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52i16_to_v52f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37656,41 +37656,41 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v26, s30, 0
-; SI-NEXT: v_writelane_b32 v26, s31, 1
-; SI-NEXT: v_writelane_b32 v26, s34, 2
-; SI-NEXT: v_writelane_b32 v26, s35, 3
-; SI-NEXT: v_writelane_b32 v26, s36, 4
-; SI-NEXT: v_writelane_b32 v26, s37, 5
-; SI-NEXT: v_writelane_b32 v26, s38, 6
-; SI-NEXT: v_writelane_b32 v26, s39, 7
-; SI-NEXT: v_writelane_b32 v26, s48, 8
-; SI-NEXT: v_writelane_b32 v26, s49, 9
-; SI-NEXT: v_writelane_b32 v26, s50, 10
-; SI-NEXT: v_writelane_b32 v26, s51, 11
-; SI-NEXT: v_writelane_b32 v26, s52, 12
-; SI-NEXT: v_writelane_b32 v26, s53, 13
-; SI-NEXT: v_writelane_b32 v26, s54, 14
-; SI-NEXT: v_writelane_b32 v26, s55, 15
-; SI-NEXT: v_writelane_b32 v26, s64, 16
-; SI-NEXT: v_writelane_b32 v26, s65, 17
-; SI-NEXT: v_writelane_b32 v26, s66, 18
-; SI-NEXT: v_writelane_b32 v26, s67, 19
-; SI-NEXT: v_writelane_b32 v26, s68, 20
-; SI-NEXT: v_writelane_b32 v26, s69, 21
-; SI-NEXT: v_writelane_b32 v26, s70, 22
-; SI-NEXT: v_writelane_b32 v26, s71, 23
-; SI-NEXT: v_writelane_b32 v26, s80, 24
-; SI-NEXT: v_writelane_b32 v26, s81, 25
-; SI-NEXT: v_writelane_b32 v26, s82, 26
-; SI-NEXT: v_writelane_b32 v26, s83, 27
-; SI-NEXT: v_writelane_b32 v26, s84, 28
-; SI-NEXT: v_writelane_b32 v26, s85, 29
-; SI-NEXT: v_writelane_b32 v26, s86, 30
-; SI-NEXT: v_writelane_b32 v26, s87, 31
-; SI-NEXT: v_writelane_b32 v26, s96, 32
-; SI-NEXT: v_writelane_b32 v26, s97, 33
-; SI-NEXT: v_writelane_b32 v26, s98, 34
+; SI-NEXT: v_writelane_b32 v26, s34, 0
+; SI-NEXT: v_writelane_b32 v26, s35, 1
+; SI-NEXT: v_writelane_b32 v26, s36, 2
+; SI-NEXT: v_writelane_b32 v26, s37, 3
+; SI-NEXT: v_writelane_b32 v26, s38, 4
+; SI-NEXT: v_writelane_b32 v26, s39, 5
+; SI-NEXT: v_writelane_b32 v26, s48, 6
+; SI-NEXT: v_writelane_b32 v26, s49, 7
+; SI-NEXT: v_writelane_b32 v26, s50, 8
+; SI-NEXT: v_writelane_b32 v26, s51, 9
+; SI-NEXT: v_writelane_b32 v26, s52, 10
+; SI-NEXT: v_writelane_b32 v26, s53, 11
+; SI-NEXT: v_writelane_b32 v26, s54, 12
+; SI-NEXT: v_writelane_b32 v26, s55, 13
+; SI-NEXT: v_writelane_b32 v26, s64, 14
+; SI-NEXT: v_writelane_b32 v26, s65, 15
+; SI-NEXT: v_writelane_b32 v26, s66, 16
+; SI-NEXT: v_writelane_b32 v26, s67, 17
+; SI-NEXT: v_writelane_b32 v26, s68, 18
+; SI-NEXT: v_writelane_b32 v26, s69, 19
+; SI-NEXT: v_writelane_b32 v26, s70, 20
+; SI-NEXT: v_writelane_b32 v26, s71, 21
+; SI-NEXT: v_writelane_b32 v26, s80, 22
+; SI-NEXT: v_writelane_b32 v26, s81, 23
+; SI-NEXT: v_writelane_b32 v26, s82, 24
+; SI-NEXT: v_writelane_b32 v26, s83, 25
+; SI-NEXT: v_writelane_b32 v26, s84, 26
+; SI-NEXT: v_writelane_b32 v26, s85, 27
+; SI-NEXT: v_writelane_b32 v26, s86, 28
+; SI-NEXT: v_writelane_b32 v26, s87, 29
+; SI-NEXT: v_writelane_b32 v26, s96, 30
+; SI-NEXT: v_writelane_b32 v26, s97, 31
+; SI-NEXT: v_writelane_b32 v26, s98, 32
+; SI-NEXT: v_writelane_b32 v26, s99, 33
+; SI-NEXT: v_writelane_b32 v26, s30, 34
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9
@@ -37723,7 +37723,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s48, s17, 16
; SI-NEXT: s_lshr_b32 s63, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; SI-NEXT: v_writelane_b32 v26, s99, 35
+; SI-NEXT: v_writelane_b32 v26, s31, 35
; SI-NEXT: v_readfirstlane_b32 s80, v6
; SI-NEXT: v_readfirstlane_b32 s97, v5
; SI-NEXT: v_readfirstlane_b32 s99, v4
@@ -38107,6 +38107,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s29, s41, 0xffff
; SI-NEXT: s_lshl_b32 s40, s66, 16
; SI-NEXT: s_or_b32 s29, s29, s40
+; SI-NEXT: v_readlane_b32 s30, v26, 34
; SI-NEXT: v_mov_b32_e32 v0, s14
; SI-NEXT: v_mov_b32_e32 v1, s15
; SI-NEXT: v_mov_b32_e32 v2, s12
@@ -38133,42 +38134,41 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v23, s27
; SI-NEXT: v_mov_b32_e32 v24, s28
; SI-NEXT: v_mov_b32_e32 v25, s29
-; SI-NEXT: v_readlane_b32 s99, v26, 35
-; SI-NEXT: v_readlane_b32 s98, v26, 34
-; SI-NEXT: v_readlane_b32 s97, v26, 33
-; SI-NEXT: v_readlane_b32 s96, v26, 32
-; SI-NEXT: v_readlane_b32 s87, v26, 31
-; SI-NEXT: v_readlane_b32 s86, v26, 30
-; SI-NEXT: v_readlane_b32 s85, v26, 29
-; SI-NEXT: v_readlane_b32 s84, v26, 28
-; SI-NEXT: v_readlane_b32 s83, v26, 27
-; SI-NEXT: v_readlane_b32 s82, v26, 26
-; SI-NEXT: v_readlane_b32 s81, v26, 25
-; SI-NEXT: v_readlane_b32 s80, v26, 24
-; SI-NEXT: v_readlane_b32 s71, v26, 23
-; SI-NEXT: v_readlane_b32 s70, v26, 22
-; SI-NEXT: v_readlane_b32 s69, v26, 21
-; SI-NEXT: v_readlane_b32 s68, v26, 20
-; SI-NEXT: v_readlane_b32 s67, v26, 19
-; SI-NEXT: v_readlane_b32 s66, v26, 18
-; SI-NEXT: v_readlane_b32 s65, v26, 17
-; SI-NEXT: v_readlane_b32 s64, v26, 16
-; SI-NEXT: v_readlane_b32 s55, v26, 15
-; SI-NEXT: v_readlane_b32 s54, v26, 14
-; SI-NEXT: v_readlane_b32 s53, v26, 13
-; SI-NEXT: v_readlane_b32 s52, v26, 12
-; SI-NEXT: v_readlane_b32 s51, v26, 11
-; SI-NEXT: v_readlane_b32 s50, v26, 10
-; SI-NEXT: v_readlane_b32 s49, v26, 9
-; SI-NEXT: v_readlane_b32 s48, v26, 8
-; SI-NEXT: v_readlane_b32 s39, v26, 7
-; SI-NEXT: v_readlane_b32 s38, v26, 6
-; SI-NEXT: v_readlane_b32 s37, v26, 5
-; SI-NEXT: v_readlane_b32 s36, v26, 4
-; SI-NEXT: v_readlane_b32 s35, v26, 3
-; SI-NEXT: v_readlane_b32 s34, v26, 2
-; SI-NEXT: v_readlane_b32 s31, v26, 1
-; SI-NEXT: v_readlane_b32 s30, v26, 0
+; SI-NEXT: v_readlane_b32 s31, v26, 35
+; SI-NEXT: v_readlane_b32 s99, v26, 33
+; SI-NEXT: v_readlane_b32 s98, v26, 32
+; SI-NEXT: v_readlane_b32 s97, v26, 31
+; SI-NEXT: v_readlane_b32 s96, v26, 30
+; SI-NEXT: v_readlane_b32 s87, v26, 29
+; SI-NEXT: v_readlane_b32 s86, v26, 28
+; SI-NEXT: v_readlane_b32 s85, v26, 27
+; SI-NEXT: v_readlane_b32 s84, v26, 26
+; SI-NEXT: v_readlane_b32 s83, v26, 25
+; SI-NEXT: v_readlane_b32 s82, v26, 24
+; SI-NEXT: v_readlane_b32 s81, v26, 23
+; SI-NEXT: v_readlane_b32 s80, v26, 22
+; SI-NEXT: v_readlane_b32 s71, v26, 21
+; SI-NEXT: v_readlane_b32 s70, v26, 20
+; SI-NEXT: v_readlane_b32 s69, v26, 19
+; SI-NEXT: v_readlane_b32 s68, v26, 18
+; SI-NEXT: v_readlane_b32 s67, v26, 17
+; SI-NEXT: v_readlane_b32 s66, v26, 16
+; SI-NEXT: v_readlane_b32 s65, v26, 15
+; SI-NEXT: v_readlane_b32 s64, v26, 14
+; SI-NEXT: v_readlane_b32 s55, v26, 13
+; SI-NEXT: v_readlane_b32 s54, v26, 12
+; SI-NEXT: v_readlane_b32 s53, v26, 11
+; SI-NEXT: v_readlane_b32 s52, v26, 10
+; SI-NEXT: v_readlane_b32 s51, v26, 9
+; SI-NEXT: v_readlane_b32 s50, v26, 8
+; SI-NEXT: v_readlane_b32 s49, v26, 7
+; SI-NEXT: v_readlane_b32 s48, v26, 6
+; SI-NEXT: v_readlane_b32 s39, v26, 5
+; SI-NEXT: v_readlane_b32 s38, v26, 4
+; SI-NEXT: v_readlane_b32 s37, v26, 3
+; SI-NEXT: v_readlane_b32 s36, v26, 2
+; SI-NEXT: v_readlane_b32 s35, v26, 1
+; SI-NEXT: v_readlane_b32 s34, v26, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -39022,7 +39022,7 @@ end:
ret <52 x half> %phi
}
-define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
+define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v52i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39896,7 +39896,7 @@ end:
ret <52 x i16> %phi
}
-define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i32 inreg %b) {
+define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41182,3 +41182,5 @@ end:
%phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <52 x i16> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 19462c5bf8a9f..cad6fe4a4e746 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) {
+define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v28f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -188,7 +188,7 @@ end:
ret <28 x float> %phi
}
-define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v28f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -478,7 +478,7 @@ end:
ret <28 x float> %phi
}
-define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) {
+define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v28i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -646,7 +646,7 @@ end:
ret <28 x i32> %phi
}
-define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v28i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -922,7 +922,7 @@ end:
ret <28 x i32> %phi
}
-define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) {
+define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v14i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1104,7 +1104,7 @@ end:
ret <14 x i64> %phi
}
-define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v14i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1394,7 +1394,7 @@ end:
ret <14 x i64> %phi
}
-define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) {
+define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v28i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1583,7 +1583,7 @@ end:
ret <28 x i32> %phi
}
-define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v28i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1880,7 +1880,7 @@ end:
ret <28 x i32> %phi
}
-define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) {
+define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v14f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2062,7 +2062,7 @@ end:
ret <14 x double> %phi
}
-define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v14f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2352,7 +2352,7 @@ end:
ret <14 x double> %phi
}
-define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) {
+define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v28i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2478,7 +2478,7 @@ end:
ret <28 x i32> %phi
}
-define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v28i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2715,7 +2715,7 @@ end:
ret <28 x i32> %phi
}
-define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
+define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3551,7 +3551,7 @@ end:
ret <56 x i16> %phi
}
-define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v56i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3559,21 +3559,21 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v28, s30, 0
-; SI-NEXT: v_writelane_b32 v28, s31, 1
-; SI-NEXT: v_writelane_b32 v28, s34, 2
-; SI-NEXT: v_writelane_b32 v28, s35, 3
-; SI-NEXT: v_writelane_b32 v28, s36, 4
-; SI-NEXT: v_writelane_b32 v28, s37, 5
-; SI-NEXT: v_writelane_b32 v28, s38, 6
-; SI-NEXT: v_writelane_b32 v28, s39, 7
-; SI-NEXT: v_writelane_b32 v28, s48, 8
-; SI-NEXT: v_writelane_b32 v28, s49, 9
+; SI-NEXT: v_writelane_b32 v28, s34, 0
+; SI-NEXT: v_writelane_b32 v28, s35, 1
+; SI-NEXT: v_writelane_b32 v28, s36, 2
+; SI-NEXT: v_writelane_b32 v28, s37, 3
+; SI-NEXT: v_writelane_b32 v28, s38, 4
+; SI-NEXT: v_writelane_b32 v28, s39, 5
+; SI-NEXT: v_writelane_b32 v28, s48, 6
+; SI-NEXT: v_writelane_b32 v28, s49, 7
+; SI-NEXT: v_writelane_b32 v28, s50, 8
+; SI-NEXT: v_writelane_b32 v28, s51, 9
; SI-NEXT: v_mov_b32_e32 v15, s16
; SI-NEXT: v_mov_b32_e32 v16, s17
; SI-NEXT: v_mov_b32_e32 v17, s18
; SI-NEXT: v_mov_b32_e32 v18, s19
-; SI-NEXT: v_writelane_b32 v28, s50, 10
+; SI-NEXT: v_writelane_b32 v28, s52, 10
; SI-NEXT: v_mov_b32_e32 v19, s20
; SI-NEXT: v_readfirstlane_b32 s44, v15
; SI-NEXT: v_mov_b32_e32 v15, s21
@@ -3583,7 +3583,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v17, s23
; SI-NEXT: v_readfirstlane_b32 s43, v18
; SI-NEXT: v_mov_b32_e32 v18, s24
-; SI-NEXT: v_writelane_b32 v28, s51, 11
+; SI-NEXT: v_writelane_b32 v28, s53, 11
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
; SI-NEXT: v_readfirstlane_b32 s41, v15
@@ -3595,7 +3595,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s22, v18
; SI-NEXT: v_mov_b32_e32 v18, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; SI-NEXT: v_writelane_b32 v28, s52, 12
+; SI-NEXT: v_writelane_b32 v28, s30, 12
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v15
; SI-NEXT: v_readfirstlane_b32 s21, v16
@@ -3616,7 +3616,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v13
-; SI-NEXT: v_writelane_b32 v28, s53, 13
+; SI-NEXT: v_writelane_b32 v28, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s30, s5, 16
@@ -3790,6 +3790,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s44
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v28, 12
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s42
@@ -3818,20 +3819,19 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v25, s7
; SI-NEXT: v_mov_b32_e32 v26, s4
; SI-NEXT: v_mov_b32_e32 v27, s5
-; SI-NEXT: v_readlane_b32 s53, v28, 13
-; SI-NEXT: v_readlane_b32 s52, v28, 12
-; SI-NEXT: v_readlane_b32 s51, v28, 11
-; SI-NEXT: v_readlane_b32 s50, v28, 10
-; SI-NEXT: v_readlane_b32 s49, v28, 9
-; SI-NEXT: v_readlane_b32 s48, v28, 8
-; SI-NEXT: v_readlane_b32 s39, v28, 7
-; SI-NEXT: v_readlane_b32 s38, v28, 6
-; SI-NEXT: v_readlane_b32 s37, v28, 5
-; SI-NEXT: v_readlane_b32 s36, v28, 4
-; SI-NEXT: v_readlane_b32 s35, v28, 3
-; SI-NEXT: v_readlane_b32 s34, v28, 2
-; SI-NEXT: v_readlane_b32 s31, v28, 1
-; SI-NEXT: v_readlane_b32 s30, v28, 0
+; SI-NEXT: v_readlane_b32 s31, v28, 13
+; SI-NEXT: v_readlane_b32 s53, v28, 11
+; SI-NEXT: v_readlane_b32 s52, v28, 10
+; SI-NEXT: v_readlane_b32 s51, v28, 9
+; SI-NEXT: v_readlane_b32 s50, v28, 8
+; SI-NEXT: v_readlane_b32 s49, v28, 7
+; SI-NEXT: v_readlane_b32 s48, v28, 6
+; SI-NEXT: v_readlane_b32 s39, v28, 5
+; SI-NEXT: v_readlane_b32 s38, v28, 4
+; SI-NEXT: v_readlane_b32 s37, v28, 3
+; SI-NEXT: v_readlane_b32 s36, v28, 2
+; SI-NEXT: v_readlane_b32 s35, v28, 1
+; SI-NEXT: v_readlane_b32 s34, v28, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -3878,7 +3878,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v16, s17
; VI-NEXT: v_mov_b32_e32 v17, s18
; VI-NEXT: v_mov_b32_e32 v18, s19
-; VI-NEXT: v_writelane_b32 v28, s30, 0
+; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_mov_b32_e32 v19, s20
; VI-NEXT: v_readfirstlane_b32 s46, v15
; VI-NEXT: v_mov_b32_e32 v15, s21
@@ -3888,7 +3888,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v17, s23
; VI-NEXT: v_readfirstlane_b32 s43, v18
; VI-NEXT: v_mov_b32_e32 v18, s24
-; VI-NEXT: v_writelane_b32 v28, s31, 1
+; VI-NEXT: v_writelane_b32 v28, s35, 1
; VI-NEXT: v_readfirstlane_b32 s42, v19
; VI-NEXT: v_mov_b32_e32 v19, s25
; VI-NEXT: v_readfirstlane_b32 s41, v15
@@ -3900,7 +3900,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s25, v18
; VI-NEXT: v_mov_b32_e32 v18, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; VI-NEXT: v_writelane_b32 v28, s34, 2
+; VI-NEXT: v_writelane_b32 v28, s30, 2
; VI-NEXT: v_readfirstlane_b32 s24, v19
; VI-NEXT: v_readfirstlane_b32 s23, v15
; VI-NEXT: v_readfirstlane_b32 s22, v16
@@ -3921,7 +3921,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s6, v12
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s7, v13
-; VI-NEXT: v_writelane_b32 v28, s35, 3
+; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB13_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s27, s7, 16
@@ -4095,6 +4095,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; VI-NEXT: s_or_b32 s8, s8, s29
; VI-NEXT: s_or_b32 s6, s6, s28
; VI-NEXT: s_or_b32 s7, s7, s27
+; VI-NEXT: v_readlane_b32 s30, v28, 2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s44
@@ -4123,10 +4124,9 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v25, s8
; VI-NEXT: v_mov_b32_e32 v26, s6
; VI-NEXT: v_mov_b32_e32 v27, s7
-; VI-NEXT: v_readlane_b32 s35, v28, 3
-; VI-NEXT: v_readlane_b32 s34, v28, 2
-; VI-NEXT: v_readlane_b32 s31, v28, 1
-; VI-NEXT: v_readlane_b32 s30, v28, 0
+; VI-NEXT: v_readlane_b32 s31, v28, 3
+; VI-NEXT: v_readlane_b32 s35, v28, 1
+; VI-NEXT: v_readlane_b32 s34, v28, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -4611,7 +4611,7 @@ end:
ret <56 x i16> %phi
}
-define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
+define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v28i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5856,7 +5856,7 @@ end:
ret <28 x i32> %phi
}
-define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v28i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6900,7 +6900,7 @@ end:
ret <28 x i32> %phi
}
-define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
+define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7736,7 +7736,7 @@ end:
ret <56 x half> %phi
}
-define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28i32_to_v56f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7744,21 +7744,21 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v28, s30, 0
-; SI-NEXT: v_writelane_b32 v28, s31, 1
-; SI-NEXT: v_writelane_b32 v28, s34, 2
-; SI-NEXT: v_writelane_b32 v28, s35, 3
-; SI-NEXT: v_writelane_b32 v28, s36, 4
-; SI-NEXT: v_writelane_b32 v28, s37, 5
-; SI-NEXT: v_writelane_b32 v28, s38, 6
-; SI-NEXT: v_writelane_b32 v28, s39, 7
-; SI-NEXT: v_writelane_b32 v28, s48, 8
-; SI-NEXT: v_writelane_b32 v28, s49, 9
+; SI-NEXT: v_writelane_b32 v28, s34, 0
+; SI-NEXT: v_writelane_b32 v28, s35, 1
+; SI-NEXT: v_writelane_b32 v28, s36, 2
+; SI-NEXT: v_writelane_b32 v28, s37, 3
+; SI-NEXT: v_writelane_b32 v28, s38, 4
+; SI-NEXT: v_writelane_b32 v28, s39, 5
+; SI-NEXT: v_writelane_b32 v28, s48, 6
+; SI-NEXT: v_writelane_b32 v28, s49, 7
+; SI-NEXT: v_writelane_b32 v28, s50, 8
+; SI-NEXT: v_writelane_b32 v28, s51, 9
; SI-NEXT: v_mov_b32_e32 v15, s16
; SI-NEXT: v_mov_b32_e32 v16, s17
; SI-NEXT: v_mov_b32_e32 v17, s18
; SI-NEXT: v_mov_b32_e32 v18, s19
-; SI-NEXT: v_writelane_b32 v28, s50, 10
+; SI-NEXT: v_writelane_b32 v28, s52, 10
; SI-NEXT: v_mov_b32_e32 v19, s20
; SI-NEXT: v_readfirstlane_b32 s44, v15
; SI-NEXT: v_mov_b32_e32 v15, s21
@@ -7768,7 +7768,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v17, s23
; SI-NEXT: v_readfirstlane_b32 s43, v18
; SI-NEXT: v_mov_b32_e32 v18, s24
-; SI-NEXT: v_writelane_b32 v28, s51, 11
+; SI-NEXT: v_writelane_b32 v28, s53, 11
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
; SI-NEXT: v_readfirstlane_b32 s41, v15
@@ -7780,7 +7780,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s22, v18
; SI-NEXT: v_mov_b32_e32 v18, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; SI-NEXT: v_writelane_b32 v28, s52, 12
+; SI-NEXT: v_writelane_b32 v28, s30, 12
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v15
; SI-NEXT: v_readfirstlane_b32 s21, v16
@@ -7801,7 +7801,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v13
-; SI-NEXT: v_writelane_b32 v28, s53, 13
+; SI-NEXT: v_writelane_b32 v28, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s30, s5, 16
@@ -7975,6 +7975,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s44
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v28, 12
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s42
@@ -8003,20 +8004,19 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v25, s7
; SI-NEXT: v_mov_b32_e32 v26, s4
; SI-NEXT: v_mov_b32_e32 v27, s5
-; SI-NEXT: v_readlane_b32 s53, v28, 13
-; SI-NEXT: v_readlane_b32 s52, v28, 12
-; SI-NEXT: v_readlane_b32 s51, v28, 11
-; SI-NEXT: v_readlane_b32 s50, v28, 10
-; SI-NEXT: v_readlane_b32 s49, v28, 9
-; SI-NEXT: v_readlane_b32 s48, v28, 8
-; SI-NEXT: v_readlane_b32 s39, v28, 7
-; SI-NEXT: v_readlane_b32 s38, v28, 6
-; SI-NEXT: v_readlane_b32 s37, v28, 5
-; SI-NEXT: v_readlane_b32 s36, v28, 4
-; SI-NEXT: v_readlane_b32 s35, v28, 3
-; SI-NEXT: v_readlane_b32 s34, v28, 2
-; SI-NEXT: v_readlane_b32 s31, v28, 1
-; SI-NEXT: v_readlane_b32 s30, v28, 0
+; SI-NEXT: v_readlane_b32 s31, v28, 13
+; SI-NEXT: v_readlane_b32 s53, v28, 11
+; SI-NEXT: v_readlane_b32 s52, v28, 10
+; SI-NEXT: v_readlane_b32 s51, v28, 9
+; SI-NEXT: v_readlane_b32 s50, v28, 8
+; SI-NEXT: v_readlane_b32 s49, v28, 7
+; SI-NEXT: v_readlane_b32 s48, v28, 6
+; SI-NEXT: v_readlane_b32 s39, v28, 5
+; SI-NEXT: v_readlane_b32 s38, v28, 4
+; SI-NEXT: v_readlane_b32 s37, v28, 3
+; SI-NEXT: v_readlane_b32 s36, v28, 2
+; SI-NEXT: v_readlane_b32 s35, v28, 1
+; SI-NEXT: v_readlane_b32 s34, v28, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -8063,7 +8063,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v16, s17
; VI-NEXT: v_mov_b32_e32 v17, s18
; VI-NEXT: v_mov_b32_e32 v18, s19
-; VI-NEXT: v_writelane_b32 v28, s30, 0
+; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_mov_b32_e32 v19, s20
; VI-NEXT: v_readfirstlane_b32 s46, v15
; VI-NEXT: v_mov_b32_e32 v15, s21
@@ -8073,7 +8073,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v17, s23
; VI-NEXT: v_readfirstlane_b32 s43, v18
; VI-NEXT: v_mov_b32_e32 v18, s24
-; VI-NEXT: v_writelane_b32 v28, s31, 1
+; VI-NEXT: v_writelane_b32 v28, s35, 1
; VI-NEXT: v_readfirstlane_b32 s42, v19
; VI-NEXT: v_mov_b32_e32 v19, s25
; VI-NEXT: v_readfirstlane_b32 s41, v15
@@ -8085,7 +8085,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s25, v18
; VI-NEXT: v_mov_b32_e32 v18, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; VI-NEXT: v_writelane_b32 v28, s34, 2
+; VI-NEXT: v_writelane_b32 v28, s30, 2
; VI-NEXT: v_readfirstlane_b32 s24, v19
; VI-NEXT: v_readfirstlane_b32 s23, v15
; VI-NEXT: v_readfirstlane_b32 s22, v16
@@ -8106,7 +8106,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s6, v12
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s7, v13
-; VI-NEXT: v_writelane_b32 v28, s35, 3
+; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB17_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s27, s7, 16
@@ -8280,6 +8280,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; VI-NEXT: s_or_b32 s8, s8, s29
; VI-NEXT: s_or_b32 s6, s6, s28
; VI-NEXT: s_or_b32 s7, s7, s27
+; VI-NEXT: v_readlane_b32 s30, v28, 2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s44
@@ -8308,10 +8309,9 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v25, s8
; VI-NEXT: v_mov_b32_e32 v26, s6
; VI-NEXT: v_mov_b32_e32 v27, s7
-; VI-NEXT: v_readlane_b32 s35, v28, 3
-; VI-NEXT: v_readlane_b32 s34, v28, 2
-; VI-NEXT: v_readlane_b32 s31, v28, 1
-; VI-NEXT: v_readlane_b32 s30, v28, 0
+; VI-NEXT: v_readlane_b32 s31, v28, 3
+; VI-NEXT: v_readlane_b32 s35, v28, 1
+; VI-NEXT: v_readlane_b32 s34, v28, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -8796,7 +8796,7 @@ end:
ret <56 x half> %phi
}
-define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
+define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v28i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10178,7 +10178,7 @@ end:
ret <28 x i32> %phi
}
-define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v28i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11272,7 +11272,7 @@ end:
ret <28 x i32> %phi
}
-define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) {
+define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v14i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11440,7 +11440,7 @@ end:
ret <14 x i64> %phi
}
-define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v14i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11716,7 +11716,7 @@ end:
ret <14 x i64> %phi
}
-define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) {
+define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v28f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11905,7 +11905,7 @@ end:
ret <28 x float> %phi
}
-define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v28f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12202,7 +12202,7 @@ end:
ret <28 x float> %phi
}
-define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) {
+define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v14f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12370,7 +12370,7 @@ end:
ret <14 x double> %phi
}
-define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v14f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12646,7 +12646,7 @@ end:
ret <14 x double> %phi
}
-define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) {
+define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v28f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12772,7 +12772,7 @@ end:
ret <28 x float> %phi
}
-define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v28f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13009,7 +13009,7 @@ end:
ret <28 x float> %phi
}
-define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
+define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13817,7 +13817,7 @@ end:
ret <56 x i16> %phi
}
-define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v56i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14935,7 +14935,7 @@ end:
ret <56 x i16> %phi
}
-define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
+define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v28f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16180,7 +16180,7 @@ end:
ret <28 x float> %phi
}
-define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v28f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17224,7 +17224,7 @@ end:
ret <28 x float> %phi
}
-define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
+define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18032,7 +18032,7 @@ end:
ret <56 x half> %phi
}
-define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v28f32_to_v56f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19150,7 +19150,7 @@ end:
ret <56 x half> %phi
}
-define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
+define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v28f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20532,7 +20532,7 @@ end:
ret <28 x float> %phi
}
-define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v28f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21626,7 +21626,7 @@ end:
ret <28 x float> %phi
}
-define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) {
+define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v14f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -21815,7 +21815,7 @@ end:
ret <14 x double> %phi
}
-define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v14f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22112,7 +22112,7 @@ end:
ret <14 x double> %phi
}
-define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) {
+define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v14i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22238,7 +22238,7 @@ end:
ret <14 x i64> %phi
}
-define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v14i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22475,7 +22475,7 @@ end:
ret <14 x i64> %phi
}
-define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) {
+define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23325,7 +23325,7 @@ end:
ret <56 x i16> %phi
}
-define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v56i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23333,21 +23333,21 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v28, s30, 0
-; SI-NEXT: v_writelane_b32 v28, s31, 1
-; SI-NEXT: v_writelane_b32 v28, s34, 2
-; SI-NEXT: v_writelane_b32 v28, s35, 3
-; SI-NEXT: v_writelane_b32 v28, s36, 4
-; SI-NEXT: v_writelane_b32 v28, s37, 5
-; SI-NEXT: v_writelane_b32 v28, s38, 6
-; SI-NEXT: v_writelane_b32 v28, s39, 7
-; SI-NEXT: v_writelane_b32 v28, s48, 8
-; SI-NEXT: v_writelane_b32 v28, s49, 9
+; SI-NEXT: v_writelane_b32 v28, s34, 0
+; SI-NEXT: v_writelane_b32 v28, s35, 1
+; SI-NEXT: v_writelane_b32 v28, s36, 2
+; SI-NEXT: v_writelane_b32 v28, s37, 3
+; SI-NEXT: v_writelane_b32 v28, s38, 4
+; SI-NEXT: v_writelane_b32 v28, s39, 5
+; SI-NEXT: v_writelane_b32 v28, s48, 6
+; SI-NEXT: v_writelane_b32 v28, s49, 7
+; SI-NEXT: v_writelane_b32 v28, s50, 8
+; SI-NEXT: v_writelane_b32 v28, s51, 9
; SI-NEXT: v_mov_b32_e32 v15, s16
; SI-NEXT: v_mov_b32_e32 v16, s17
; SI-NEXT: v_mov_b32_e32 v17, s18
; SI-NEXT: v_mov_b32_e32 v18, s19
-; SI-NEXT: v_writelane_b32 v28, s50, 10
+; SI-NEXT: v_writelane_b32 v28, s52, 10
; SI-NEXT: v_mov_b32_e32 v19, s20
; SI-NEXT: v_readfirstlane_b32 s44, v15
; SI-NEXT: v_mov_b32_e32 v15, s21
@@ -23357,7 +23357,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v17, s23
; SI-NEXT: v_readfirstlane_b32 s43, v18
; SI-NEXT: v_mov_b32_e32 v18, s24
-; SI-NEXT: v_writelane_b32 v28, s51, 11
+; SI-NEXT: v_writelane_b32 v28, s53, 11
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
; SI-NEXT: v_readfirstlane_b32 s41, v15
@@ -23369,7 +23369,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s22, v18
; SI-NEXT: v_mov_b32_e32 v18, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; SI-NEXT: v_writelane_b32 v28, s52, 12
+; SI-NEXT: v_writelane_b32 v28, s30, 12
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v15
; SI-NEXT: v_readfirstlane_b32 s21, v16
@@ -23390,7 +23390,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v13
-; SI-NEXT: v_writelane_b32 v28, s53, 13
+; SI-NEXT: v_writelane_b32 v28, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s30, s5, 16
@@ -23564,6 +23564,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s44
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v28, 12
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s42
@@ -23592,20 +23593,19 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v25, s7
; SI-NEXT: v_mov_b32_e32 v26, s4
; SI-NEXT: v_mov_b32_e32 v27, s5
-; SI-NEXT: v_readlane_b32 s53, v28, 13
-; SI-NEXT: v_readlane_b32 s52, v28, 12
-; SI-NEXT: v_readlane_b32 s51, v28, 11
-; SI-NEXT: v_readlane_b32 s50, v28, 10
-; SI-NEXT: v_readlane_b32 s49, v28, 9
-; SI-NEXT: v_readlane_b32 s48, v28, 8
-; SI-NEXT: v_readlane_b32 s39, v28, 7
-; SI-NEXT: v_readlane_b32 s38, v28, 6
-; SI-NEXT: v_readlane_b32 s37, v28, 5
-; SI-NEXT: v_readlane_b32 s36, v28, 4
-; SI-NEXT: v_readlane_b32 s35, v28, 3
-; SI-NEXT: v_readlane_b32 s34, v28, 2
-; SI-NEXT: v_readlane_b32 s31, v28, 1
-; SI-NEXT: v_readlane_b32 s30, v28, 0
+; SI-NEXT: v_readlane_b32 s31, v28, 13
+; SI-NEXT: v_readlane_b32 s53, v28, 11
+; SI-NEXT: v_readlane_b32 s52, v28, 10
+; SI-NEXT: v_readlane_b32 s51, v28, 9
+; SI-NEXT: v_readlane_b32 s50, v28, 8
+; SI-NEXT: v_readlane_b32 s49, v28, 7
+; SI-NEXT: v_readlane_b32 s48, v28, 6
+; SI-NEXT: v_readlane_b32 s39, v28, 5
+; SI-NEXT: v_readlane_b32 s38, v28, 4
+; SI-NEXT: v_readlane_b32 s37, v28, 3
+; SI-NEXT: v_readlane_b32 s36, v28, 2
+; SI-NEXT: v_readlane_b32 s35, v28, 1
+; SI-NEXT: v_readlane_b32 s34, v28, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -23652,7 +23652,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v16, s17
; VI-NEXT: v_mov_b32_e32 v17, s18
; VI-NEXT: v_mov_b32_e32 v18, s19
-; VI-NEXT: v_writelane_b32 v28, s30, 0
+; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_mov_b32_e32 v19, s20
; VI-NEXT: v_readfirstlane_b32 s46, v15
; VI-NEXT: v_mov_b32_e32 v15, s21
@@ -23662,7 +23662,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v17, s23
; VI-NEXT: v_readfirstlane_b32 s43, v18
; VI-NEXT: v_mov_b32_e32 v18, s24
-; VI-NEXT: v_writelane_b32 v28, s31, 1
+; VI-NEXT: v_writelane_b32 v28, s35, 1
; VI-NEXT: v_readfirstlane_b32 s42, v19
; VI-NEXT: v_mov_b32_e32 v19, s25
; VI-NEXT: v_readfirstlane_b32 s41, v15
@@ -23674,7 +23674,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s25, v18
; VI-NEXT: v_mov_b32_e32 v18, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; VI-NEXT: v_writelane_b32 v28, s34, 2
+; VI-NEXT: v_writelane_b32 v28, s30, 2
; VI-NEXT: v_readfirstlane_b32 s24, v19
; VI-NEXT: v_readfirstlane_b32 s23, v15
; VI-NEXT: v_readfirstlane_b32 s22, v16
@@ -23695,7 +23695,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s6, v12
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s7, v13
-; VI-NEXT: v_writelane_b32 v28, s35, 3
+; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB41_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s27, s7, 16
@@ -23869,6 +23869,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; VI-NEXT: s_or_b32 s8, s8, s29
; VI-NEXT: s_or_b32 s6, s6, s28
; VI-NEXT: s_or_b32 s7, s7, s27
+; VI-NEXT: v_readlane_b32 s30, v28, 2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s44
@@ -23897,10 +23898,9 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v25, s8
; VI-NEXT: v_mov_b32_e32 v26, s6
; VI-NEXT: v_mov_b32_e32 v27, s7
-; VI-NEXT: v_readlane_b32 s35, v28, 3
-; VI-NEXT: v_readlane_b32 s34, v28, 2
-; VI-NEXT: v_readlane_b32 s31, v28, 1
-; VI-NEXT: v_readlane_b32 s30, v28, 0
+; VI-NEXT: v_readlane_b32 s31, v28, 3
+; VI-NEXT: v_readlane_b32 s35, v28, 1
+; VI-NEXT: v_readlane_b32 s34, v28, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -24385,7 +24385,7 @@ end:
ret <56 x i16> %phi
}
-define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
+define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v14i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25630,7 +25630,7 @@ end:
ret <14 x i64> %phi
}
-define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v14i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26674,7 +26674,7 @@ end:
ret <14 x i64> %phi
}
-define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) {
+define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27524,7 +27524,7 @@ end:
ret <56 x half> %phi
}
-define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14i64_to_v56f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27532,21 +27532,21 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v28, s30, 0
-; SI-NEXT: v_writelane_b32 v28, s31, 1
-; SI-NEXT: v_writelane_b32 v28, s34, 2
-; SI-NEXT: v_writelane_b32 v28, s35, 3
-; SI-NEXT: v_writelane_b32 v28, s36, 4
-; SI-NEXT: v_writelane_b32 v28, s37, 5
-; SI-NEXT: v_writelane_b32 v28, s38, 6
-; SI-NEXT: v_writelane_b32 v28, s39, 7
-; SI-NEXT: v_writelane_b32 v28, s48, 8
-; SI-NEXT: v_writelane_b32 v28, s49, 9
+; SI-NEXT: v_writelane_b32 v28, s34, 0
+; SI-NEXT: v_writelane_b32 v28, s35, 1
+; SI-NEXT: v_writelane_b32 v28, s36, 2
+; SI-NEXT: v_writelane_b32 v28, s37, 3
+; SI-NEXT: v_writelane_b32 v28, s38, 4
+; SI-NEXT: v_writelane_b32 v28, s39, 5
+; SI-NEXT: v_writelane_b32 v28, s48, 6
+; SI-NEXT: v_writelane_b32 v28, s49, 7
+; SI-NEXT: v_writelane_b32 v28, s50, 8
+; SI-NEXT: v_writelane_b32 v28, s51, 9
; SI-NEXT: v_mov_b32_e32 v15, s16
; SI-NEXT: v_mov_b32_e32 v16, s17
; SI-NEXT: v_mov_b32_e32 v17, s18
; SI-NEXT: v_mov_b32_e32 v18, s19
-; SI-NEXT: v_writelane_b32 v28, s50, 10
+; SI-NEXT: v_writelane_b32 v28, s52, 10
; SI-NEXT: v_mov_b32_e32 v19, s20
; SI-NEXT: v_readfirstlane_b32 s44, v15
; SI-NEXT: v_mov_b32_e32 v15, s21
@@ -27556,7 +27556,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v17, s23
; SI-NEXT: v_readfirstlane_b32 s43, v18
; SI-NEXT: v_mov_b32_e32 v18, s24
-; SI-NEXT: v_writelane_b32 v28, s51, 11
+; SI-NEXT: v_writelane_b32 v28, s53, 11
; SI-NEXT: v_readfirstlane_b32 s40, v19
; SI-NEXT: v_mov_b32_e32 v19, s25
; SI-NEXT: v_readfirstlane_b32 s41, v15
@@ -27568,7 +27568,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s22, v18
; SI-NEXT: v_mov_b32_e32 v18, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; SI-NEXT: v_writelane_b32 v28, s52, 12
+; SI-NEXT: v_writelane_b32 v28, s30, 12
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v15
; SI-NEXT: v_readfirstlane_b32 s21, v16
@@ -27589,7 +27589,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v12
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v13
-; SI-NEXT: v_writelane_b32 v28, s53, 13
+; SI-NEXT: v_writelane_b32 v28, s31, 13
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s30, s5, 16
@@ -27763,6 +27763,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s44
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v28, 12
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s42
@@ -27791,20 +27792,19 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v25, s7
; SI-NEXT: v_mov_b32_e32 v26, s4
; SI-NEXT: v_mov_b32_e32 v27, s5
-; SI-NEXT: v_readlane_b32 s53, v28, 13
-; SI-NEXT: v_readlane_b32 s52, v28, 12
-; SI-NEXT: v_readlane_b32 s51, v28, 11
-; SI-NEXT: v_readlane_b32 s50, v28, 10
-; SI-NEXT: v_readlane_b32 s49, v28, 9
-; SI-NEXT: v_readlane_b32 s48, v28, 8
-; SI-NEXT: v_readlane_b32 s39, v28, 7
-; SI-NEXT: v_readlane_b32 s38, v28, 6
-; SI-NEXT: v_readlane_b32 s37, v28, 5
-; SI-NEXT: v_readlane_b32 s36, v28, 4
-; SI-NEXT: v_readlane_b32 s35, v28, 3
-; SI-NEXT: v_readlane_b32 s34, v28, 2
-; SI-NEXT: v_readlane_b32 s31, v28, 1
-; SI-NEXT: v_readlane_b32 s30, v28, 0
+; SI-NEXT: v_readlane_b32 s31, v28, 13
+; SI-NEXT: v_readlane_b32 s53, v28, 11
+; SI-NEXT: v_readlane_b32 s52, v28, 10
+; SI-NEXT: v_readlane_b32 s51, v28, 9
+; SI-NEXT: v_readlane_b32 s50, v28, 8
+; SI-NEXT: v_readlane_b32 s49, v28, 7
+; SI-NEXT: v_readlane_b32 s48, v28, 6
+; SI-NEXT: v_readlane_b32 s39, v28, 5
+; SI-NEXT: v_readlane_b32 s38, v28, 4
+; SI-NEXT: v_readlane_b32 s37, v28, 3
+; SI-NEXT: v_readlane_b32 s36, v28, 2
+; SI-NEXT: v_readlane_b32 s35, v28, 1
+; SI-NEXT: v_readlane_b32 s34, v28, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -27851,7 +27851,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v16, s17
; VI-NEXT: v_mov_b32_e32 v17, s18
; VI-NEXT: v_mov_b32_e32 v18, s19
-; VI-NEXT: v_writelane_b32 v28, s30, 0
+; VI-NEXT: v_writelane_b32 v28, s34, 0
; VI-NEXT: v_mov_b32_e32 v19, s20
; VI-NEXT: v_readfirstlane_b32 s46, v15
; VI-NEXT: v_mov_b32_e32 v15, s21
@@ -27861,7 +27861,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v17, s23
; VI-NEXT: v_readfirstlane_b32 s43, v18
; VI-NEXT: v_mov_b32_e32 v18, s24
-; VI-NEXT: v_writelane_b32 v28, s31, 1
+; VI-NEXT: v_writelane_b32 v28, s35, 1
; VI-NEXT: v_readfirstlane_b32 s42, v19
; VI-NEXT: v_mov_b32_e32 v19, s25
; VI-NEXT: v_readfirstlane_b32 s41, v15
@@ -27873,7 +27873,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s25, v18
; VI-NEXT: v_mov_b32_e32 v18, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; VI-NEXT: v_writelane_b32 v28, s34, 2
+; VI-NEXT: v_writelane_b32 v28, s30, 2
; VI-NEXT: v_readfirstlane_b32 s24, v19
; VI-NEXT: v_readfirstlane_b32 s23, v15
; VI-NEXT: v_readfirstlane_b32 s22, v16
@@ -27894,7 +27894,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s6, v12
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s7, v13
-; VI-NEXT: v_writelane_b32 v28, s35, 3
+; VI-NEXT: v_writelane_b32 v28, s31, 3
; VI-NEXT: s_cbranch_scc0 .LBB45_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s27, s7, 16
@@ -28068,6 +28068,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; VI-NEXT: s_or_b32 s8, s8, s29
; VI-NEXT: s_or_b32 s6, s6, s28
; VI-NEXT: s_or_b32 s7, s7, s27
+; VI-NEXT: v_readlane_b32 s30, v28, 2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s44
@@ -28096,10 +28097,9 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v25, s8
; VI-NEXT: v_mov_b32_e32 v26, s6
; VI-NEXT: v_mov_b32_e32 v27, s7
-; VI-NEXT: v_readlane_b32 s35, v28, 3
-; VI-NEXT: v_readlane_b32 s34, v28, 2
-; VI-NEXT: v_readlane_b32 s31, v28, 1
-; VI-NEXT: v_readlane_b32 s30, v28, 0
+; VI-NEXT: v_readlane_b32 s31, v28, 3
+; VI-NEXT: v_readlane_b32 s35, v28, 1
+; VI-NEXT: v_readlane_b32 s34, v28, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -28584,7 +28584,7 @@ end:
ret <56 x half> %phi
}
-define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
+define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v14i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29966,7 +29966,7 @@ end:
ret <14 x i64> %phi
}
-define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v14i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31060,7 +31060,7 @@ end:
ret <14 x i64> %phi
}
-define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) {
+define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31826,7 +31826,7 @@ end:
ret <56 x i16> %phi
}
-define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v56i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32898,7 +32898,7 @@ end:
ret <56 x i16> %phi
}
-define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
+define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v14f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34143,7 +34143,7 @@ end:
ret <14 x double> %phi
}
-define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v14f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35187,7 +35187,7 @@ end:
ret <14 x double> %phi
}
-define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
+define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35953,7 +35953,7 @@ end:
ret <56 x half> %phi
}
-define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v14f64_to_v56f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37025,7 +37025,7 @@ end:
ret <56 x half> %phi
}
-define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
+define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v14f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38407,7 +38407,7 @@ end:
ret <14 x double> %phi
}
-define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v14f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39501,7 +39501,7 @@ end:
ret <14 x double> %phi
}
-define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
+define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v56f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40852,7 +40852,7 @@ end:
ret <56 x half> %phi
}
-define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i32 inreg %b) {
+define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56i16_to_v56f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40861,40 +40861,39 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v28, s30, 0
-; SI-NEXT: v_writelane_b32 v28, s31, 1
-; SI-NEXT: v_writelane_b32 v28, s34, 2
-; SI-NEXT: v_writelane_b32 v28, s35, 3
-; SI-NEXT: v_writelane_b32 v28, s36, 4
-; SI-NEXT: v_writelane_b32 v28, s37, 5
-; SI-NEXT: v_writelane_b32 v28, s38, 6
-; SI-NEXT: v_writelane_b32 v28, s39, 7
-; SI-NEXT: v_writelane_b32 v28, s48, 8
-; SI-NEXT: v_writelane_b32 v28, s49, 9
-; SI-NEXT: v_writelane_b32 v28, s50, 10
-; SI-NEXT: v_writelane_b32 v28, s51, 11
-; SI-NEXT: v_writelane_b32 v28, s52, 12
-; SI-NEXT: v_writelane_b32 v28, s53, 13
-; SI-NEXT: v_writelane_b32 v28, s54, 14
-; SI-NEXT: v_writelane_b32 v28, s55, 15
-; SI-NEXT: v_writelane_b32 v28, s64, 16
-; SI-NEXT: v_writelane_b32 v28, s65, 17
-; SI-NEXT: v_writelane_b32 v28, s66, 18
-; SI-NEXT: v_writelane_b32 v28, s67, 19
-; SI-NEXT: v_writelane_b32 v28, s68, 20
-; SI-NEXT: v_writelane_b32 v28, s69, 21
-; SI-NEXT: v_writelane_b32 v28, s70, 22
-; SI-NEXT: v_writelane_b32 v28, s71, 23
-; SI-NEXT: v_writelane_b32 v28, s80, 24
-; SI-NEXT: v_writelane_b32 v28, s81, 25
-; SI-NEXT: v_writelane_b32 v28, s82, 26
-; SI-NEXT: v_writelane_b32 v28, s83, 27
-; SI-NEXT: v_writelane_b32 v28, s84, 28
-; SI-NEXT: v_writelane_b32 v28, s85, 29
-; SI-NEXT: v_writelane_b32 v28, s86, 30
-; SI-NEXT: v_writelane_b32 v28, s87, 31
+; SI-NEXT: v_writelane_b32 v28, s34, 0
+; SI-NEXT: v_writelane_b32 v28, s35, 1
+; SI-NEXT: v_writelane_b32 v28, s36, 2
+; SI-NEXT: v_writelane_b32 v28, s37, 3
+; SI-NEXT: v_writelane_b32 v28, s38, 4
+; SI-NEXT: v_writelane_b32 v28, s39, 5
+; SI-NEXT: v_writelane_b32 v28, s48, 6
+; SI-NEXT: v_writelane_b32 v28, s49, 7
+; SI-NEXT: v_writelane_b32 v28, s50, 8
+; SI-NEXT: v_writelane_b32 v28, s51, 9
+; SI-NEXT: v_writelane_b32 v28, s52, 10
+; SI-NEXT: v_writelane_b32 v28, s53, 11
+; SI-NEXT: v_writelane_b32 v28, s54, 12
+; SI-NEXT: v_writelane_b32 v28, s55, 13
+; SI-NEXT: v_writelane_b32 v28, s64, 14
+; SI-NEXT: v_writelane_b32 v28, s65, 15
+; SI-NEXT: v_writelane_b32 v28, s66, 16
+; SI-NEXT: v_writelane_b32 v28, s67, 17
+; SI-NEXT: v_writelane_b32 v28, s68, 18
+; SI-NEXT: v_writelane_b32 v28, s69, 19
+; SI-NEXT: v_writelane_b32 v28, s70, 20
+; SI-NEXT: v_writelane_b32 v28, s71, 21
+; SI-NEXT: v_writelane_b32 v28, s80, 22
+; SI-NEXT: v_writelane_b32 v28, s81, 23
+; SI-NEXT: v_writelane_b32 v28, s82, 24
+; SI-NEXT: v_writelane_b32 v28, s83, 25
+; SI-NEXT: v_writelane_b32 v28, s84, 26
+; SI-NEXT: v_writelane_b32 v28, s85, 27
+; SI-NEXT: v_writelane_b32 v28, s86, 28
+; SI-NEXT: v_writelane_b32 v28, s87, 29
+; SI-NEXT: v_writelane_b32 v28, s96, 30
; SI-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v28, s96, 32
+; SI-NEXT: v_writelane_b32 v28, s97, 31
; SI-NEXT: s_lshr_b32 s66, s29, 16
; SI-NEXT: s_lshr_b32 s93, s28, 16
; SI-NEXT: s_lshr_b32 s65, s27, 16
@@ -40912,24 +40911,25 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v29, s17, 0
-; SI-NEXT: v_writelane_b32 v28, s97, 33
+; SI-NEXT: v_writelane_b32 v28, s98, 32
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
; SI-NEXT: v_readfirstlane_b32 s50, v9
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_writelane_b32 v29, s16, 1
-; SI-NEXT: v_writelane_b32 v28, s98, 34
+; SI-NEXT: v_writelane_b32 v28, s99, 33
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11
; SI-NEXT: v_readfirstlane_b32 s97, v11
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6
; SI-NEXT: v_readfirstlane_b32 s5, v9
; SI-NEXT: v_writelane_b32 v29, s19, 2
-; SI-NEXT: v_writelane_b32 v28, s99, 35
+; SI-NEXT: v_writelane_b32 v28, s30, 34
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13
; SI-NEXT: v_readfirstlane_b32 s99, v13
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; SI-NEXT: v_readfirstlane_b32 s7, v11
; SI-NEXT: v_writelane_b32 v29, s5, 3
+; SI-NEXT: v_writelane_b32 v28, s31, 35
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10
; SI-NEXT: v_readfirstlane_b32 s84, v12
@@ -41380,6 +41380,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s11, s11, 0xffff
; SI-NEXT: s_lshl_b32 s42, s81, 16
; SI-NEXT: s_or_b32 s11, s11, s42
+; SI-NEXT: v_readlane_b32 s30, v28, 34
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s6
@@ -41408,42 +41409,41 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v25, s13
; SI-NEXT: v_mov_b32_e32 v26, s10
; SI-NEXT: v_mov_b32_e32 v27, s11
-; SI-NEXT: v_readlane_b32 s99, v28, 35
-; SI-NEXT: v_readlane_b32 s98, v28, 34
-; SI-NEXT: v_readlane_b32 s97, v28, 33
-; SI-NEXT: v_readlane_b32 s96, v28, 32
-; SI-NEXT: v_readlane_b32 s87, v28, 31
-; SI-NEXT: v_readlane_b32 s86, v28, 30
-; SI-NEXT: v_readlane_b32 s85, v28, 29
-; SI-NEXT: v_readlane_b32 s84, v28, 28
-; SI-NEXT: v_readlane_b32 s83, v28, 27
-; SI-NEXT: v_readlane_b32 s82, v28, 26
-; SI-NEXT: v_readlane_b32 s81, v28, 25
-; SI-NEXT: v_readlane_b32 s80, v28, 24
-; SI-NEXT: v_readlane_b32 s71, v28, 23
-; SI-NEXT: v_readlane_b32 s70, v28, 22
-; SI-NEXT: v_readlane_b32 s69, v28, 21
-; SI-NEXT: v_readlane_b32 s68, v28, 20
-; SI-NEXT: v_readlane_b32 s67, v28, 19
-; SI-NEXT: v_readlane_b32 s66, v28, 18
-; SI-NEXT: v_readlane_b32 s65, v28, 17
-; SI-NEXT: v_readlane_b32 s64, v28, 16
-; SI-NEXT: v_readlane_b32 s55, v28, 15
-; SI-NEXT: v_readlane_b32 s54, v28, 14
-; SI-NEXT: v_readlane_b32 s53, v28, 13
-; SI-NEXT: v_readlane_b32 s52, v28, 12
-; SI-NEXT: v_readlane_b32 s51, v28, 11
-; SI-NEXT: v_readlane_b32 s50, v28, 10
-; SI-NEXT: v_readlane_b32 s49, v28, 9
-; SI-NEXT: v_readlane_b32 s48, v28, 8
-; SI-NEXT: v_readlane_b32 s39, v28, 7
-; SI-NEXT: v_readlane_b32 s38, v28, 6
-; SI-NEXT: v_readlane_b32 s37, v28, 5
-; SI-NEXT: v_readlane_b32 s36, v28, 4
-; SI-NEXT: v_readlane_b32 s35, v28, 3
-; SI-NEXT: v_readlane_b32 s34, v28, 2
-; SI-NEXT: v_readlane_b32 s31, v28, 1
-; SI-NEXT: v_readlane_b32 s30, v28, 0
+; SI-NEXT: v_readlane_b32 s31, v28, 35
+; SI-NEXT: v_readlane_b32 s99, v28, 33
+; SI-NEXT: v_readlane_b32 s98, v28, 32
+; SI-NEXT: v_readlane_b32 s97, v28, 31
+; SI-NEXT: v_readlane_b32 s96, v28, 30
+; SI-NEXT: v_readlane_b32 s87, v28, 29
+; SI-NEXT: v_readlane_b32 s86, v28, 28
+; SI-NEXT: v_readlane_b32 s85, v28, 27
+; SI-NEXT: v_readlane_b32 s84, v28, 26
+; SI-NEXT: v_readlane_b32 s83, v28, 25
+; SI-NEXT: v_readlane_b32 s82, v28, 24
+; SI-NEXT: v_readlane_b32 s81, v28, 23
+; SI-NEXT: v_readlane_b32 s80, v28, 22
+; SI-NEXT: v_readlane_b32 s71, v28, 21
+; SI-NEXT: v_readlane_b32 s70, v28, 20
+; SI-NEXT: v_readlane_b32 s69, v28, 19
+; SI-NEXT: v_readlane_b32 s68, v28, 18
+; SI-NEXT: v_readlane_b32 s67, v28, 17
+; SI-NEXT: v_readlane_b32 s66, v28, 16
+; SI-NEXT: v_readlane_b32 s65, v28, 15
+; SI-NEXT: v_readlane_b32 s64, v28, 14
+; SI-NEXT: v_readlane_b32 s55, v28, 13
+; SI-NEXT: v_readlane_b32 s54, v28, 12
+; SI-NEXT: v_readlane_b32 s53, v28, 11
+; SI-NEXT: v_readlane_b32 s52, v28, 10
+; SI-NEXT: v_readlane_b32 s51, v28, 9
+; SI-NEXT: v_readlane_b32 s50, v28, 8
+; SI-NEXT: v_readlane_b32 s49, v28, 7
+; SI-NEXT: v_readlane_b32 s48, v28, 6
+; SI-NEXT: v_readlane_b32 s39, v28, 5
+; SI-NEXT: v_readlane_b32 s38, v28, 4
+; SI-NEXT: v_readlane_b32 s37, v28, 3
+; SI-NEXT: v_readlane_b32 s36, v28, 2
+; SI-NEXT: v_readlane_b32 s35, v28, 1
+; SI-NEXT: v_readlane_b32 s34, v28, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -42364,7 +42364,7 @@ end:
ret <56 x half> %phi
}
-define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
+define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v56i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43322,7 +43322,7 @@ end:
ret <56 x i16> %phi
}
-define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i32 inreg %b) {
+define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v56f16_to_v56i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44716,3 +44716,5 @@ end:
%phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <56 x i16> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 4fe874215b3f8..fff8e0bfb619c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) {
+define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v30f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -196,7 +196,7 @@ end:
ret <30 x float> %phi
}
-define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v30f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -501,7 +501,7 @@ end:
ret <30 x float> %phi
}
-define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) {
+define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v30i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -676,7 +676,7 @@ end:
ret <30 x i32> %phi
}
-define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v30i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -966,7 +966,7 @@ end:
ret <30 x i32> %phi
}
-define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) {
+define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v15i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1156,7 +1156,7 @@ end:
ret <15 x i64> %phi
}
-define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v15i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1461,7 +1461,7 @@ end:
ret <15 x i64> %phi
}
-define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) {
+define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v30i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1659,7 +1659,7 @@ end:
ret <30 x i32> %phi
}
-define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v30i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1972,7 +1972,7 @@ end:
ret <30 x i32> %phi
}
-define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) {
+define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v15f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2162,7 +2162,7 @@ end:
ret <15 x double> %phi
}
-define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v15f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2467,7 +2467,7 @@ end:
ret <15 x double> %phi
}
-define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) {
+define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v30i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2597,7 +2597,7 @@ end:
ret <30 x i32> %phi
}
-define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v30i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2845,7 +2845,7 @@ end:
ret <30 x i32> %phi
}
-define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) {
+define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3753,7 +3753,7 @@ end:
ret <60 x i16> %phi
}
-define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v60i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3761,40 +3761,40 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v30, s30, 0
-; SI-NEXT: v_writelane_b32 v30, s31, 1
-; SI-NEXT: v_writelane_b32 v30, s34, 2
-; SI-NEXT: v_writelane_b32 v30, s35, 3
-; SI-NEXT: v_writelane_b32 v30, s36, 4
-; SI-NEXT: v_writelane_b32 v30, s37, 5
-; SI-NEXT: v_writelane_b32 v30, s38, 6
-; SI-NEXT: v_writelane_b32 v30, s39, 7
-; SI-NEXT: v_writelane_b32 v30, s48, 8
-; SI-NEXT: v_writelane_b32 v30, s49, 9
-; SI-NEXT: v_writelane_b32 v30, s50, 10
+; SI-NEXT: v_writelane_b32 v30, s34, 0
+; SI-NEXT: v_writelane_b32 v30, s35, 1
+; SI-NEXT: v_writelane_b32 v30, s36, 2
+; SI-NEXT: v_writelane_b32 v30, s37, 3
+; SI-NEXT: v_writelane_b32 v30, s38, 4
+; SI-NEXT: v_writelane_b32 v30, s39, 5
+; SI-NEXT: v_writelane_b32 v30, s48, 6
+; SI-NEXT: v_writelane_b32 v30, s49, 7
+; SI-NEXT: v_writelane_b32 v30, s50, 8
+; SI-NEXT: v_writelane_b32 v30, s51, 9
+; SI-NEXT: v_writelane_b32 v30, s52, 10
; SI-NEXT: v_mov_b32_e32 v17, s16
; SI-NEXT: v_mov_b32_e32 v18, s17
-; SI-NEXT: v_writelane_b32 v30, s51, 11
+; SI-NEXT: v_writelane_b32 v30, s53, 11
; SI-NEXT: v_mov_b32_e32 v19, s18
; SI-NEXT: v_readfirstlane_b32 s46, v17
; SI-NEXT: v_mov_b32_e32 v17, s19
; SI-NEXT: v_readfirstlane_b32 s47, v18
; SI-NEXT: v_mov_b32_e32 v18, s20
-; SI-NEXT: v_writelane_b32 v30, s52, 12
+; SI-NEXT: v_writelane_b32 v30, s54, 12
; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
; SI-NEXT: v_readfirstlane_b32 s45, v17
; SI-NEXT: v_mov_b32_e32 v17, s22
; SI-NEXT: v_readfirstlane_b32 s42, v18
; SI-NEXT: v_mov_b32_e32 v18, s23
-; SI-NEXT: v_writelane_b32 v30, s53, 13
+; SI-NEXT: v_writelane_b32 v30, s55, 13
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
; SI-NEXT: v_readfirstlane_b32 s40, v17
; SI-NEXT: v_mov_b32_e32 v17, s25
; SI-NEXT: v_readfirstlane_b32 s41, v18
; SI-NEXT: v_mov_b32_e32 v18, s26
-; SI-NEXT: v_writelane_b32 v30, s54, 14
+; SI-NEXT: v_writelane_b32 v30, s64, 14
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
; SI-NEXT: v_readfirstlane_b32 s25, v17
@@ -3802,7 +3802,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s22, v18
; SI-NEXT: v_mov_b32_e32 v18, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_writelane_b32 v30, s55, 15
+; SI-NEXT: v_writelane_b32 v30, s30, 15
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v17
; SI-NEXT: v_readfirstlane_b32 s21, v18
@@ -3823,7 +3823,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v15
-; SI-NEXT: v_writelane_b32 v30, s64, 16
+; SI-NEXT: v_writelane_b32 v30, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s34, s5, 16
@@ -4009,6 +4009,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s46
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v30, 15
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s44
@@ -4039,23 +4040,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v27, s7
; SI-NEXT: v_mov_b32_e32 v28, s4
; SI-NEXT: v_mov_b32_e32 v29, s5
-; SI-NEXT: v_readlane_b32 s64, v30, 16
-; SI-NEXT: v_readlane_b32 s55, v30, 15
-; SI-NEXT: v_readlane_b32 s54, v30, 14
-; SI-NEXT: v_readlane_b32 s53, v30, 13
-; SI-NEXT: v_readlane_b32 s52, v30, 12
-; SI-NEXT: v_readlane_b32 s51, v30, 11
-; SI-NEXT: v_readlane_b32 s50, v30, 10
-; SI-NEXT: v_readlane_b32 s49, v30, 9
-; SI-NEXT: v_readlane_b32 s48, v30, 8
-; SI-NEXT: v_readlane_b32 s39, v30, 7
-; SI-NEXT: v_readlane_b32 s38, v30, 6
-; SI-NEXT: v_readlane_b32 s37, v30, 5
-; SI-NEXT: v_readlane_b32 s36, v30, 4
-; SI-NEXT: v_readlane_b32 s35, v30, 3
-; SI-NEXT: v_readlane_b32 s34, v30, 2
-; SI-NEXT: v_readlane_b32 s31, v30, 1
-; SI-NEXT: v_readlane_b32 s30, v30, 0
+; SI-NEXT: v_readlane_b32 s31, v30, 16
+; SI-NEXT: v_readlane_b32 s64, v30, 14
+; SI-NEXT: v_readlane_b32 s55, v30, 13
+; SI-NEXT: v_readlane_b32 s54, v30, 12
+; SI-NEXT: v_readlane_b32 s53, v30, 11
+; SI-NEXT: v_readlane_b32 s52, v30, 10
+; SI-NEXT: v_readlane_b32 s51, v30, 9
+; SI-NEXT: v_readlane_b32 s50, v30, 8
+; SI-NEXT: v_readlane_b32 s49, v30, 7
+; SI-NEXT: v_readlane_b32 s48, v30, 6
+; SI-NEXT: v_readlane_b32 s39, v30, 5
+; SI-NEXT: v_readlane_b32 s38, v30, 4
+; SI-NEXT: v_readlane_b32 s37, v30, 3
+; SI-NEXT: v_readlane_b32 s36, v30, 2
+; SI-NEXT: v_readlane_b32 s35, v30, 1
+; SI-NEXT: v_readlane_b32 s34, v30, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -4100,31 +4100,31 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v30, s30, 0
-; VI-NEXT: v_writelane_b32 v30, s31, 1
+; VI-NEXT: v_writelane_b32 v30, s34, 0
+; VI-NEXT: v_writelane_b32 v30, s35, 1
; VI-NEXT: v_mov_b32_e32 v17, s16
; VI-NEXT: v_mov_b32_e32 v18, s17
-; VI-NEXT: v_writelane_b32 v30, s34, 2
+; VI-NEXT: v_writelane_b32 v30, s36, 2
; VI-NEXT: v_mov_b32_e32 v19, s18
; VI-NEXT: v_readfirstlane_b32 s56, v17
; VI-NEXT: v_mov_b32_e32 v17, s19
; VI-NEXT: v_readfirstlane_b32 s47, v18
; VI-NEXT: v_mov_b32_e32 v18, s20
-; VI-NEXT: v_writelane_b32 v30, s35, 3
+; VI-NEXT: v_writelane_b32 v30, s37, 3
; VI-NEXT: v_readfirstlane_b32 s46, v19
; VI-NEXT: v_mov_b32_e32 v19, s21
; VI-NEXT: v_readfirstlane_b32 s45, v17
; VI-NEXT: v_mov_b32_e32 v17, s22
; VI-NEXT: v_readfirstlane_b32 s44, v18
; VI-NEXT: v_mov_b32_e32 v18, s23
-; VI-NEXT: v_writelane_b32 v30, s36, 4
+; VI-NEXT: v_writelane_b32 v30, s38, 4
; VI-NEXT: v_readfirstlane_b32 s43, v19
; VI-NEXT: v_mov_b32_e32 v19, s24
; VI-NEXT: v_readfirstlane_b32 s42, v17
; VI-NEXT: v_mov_b32_e32 v17, s25
; VI-NEXT: v_readfirstlane_b32 s41, v18
; VI-NEXT: v_mov_b32_e32 v18, s26
-; VI-NEXT: v_writelane_b32 v30, s37, 5
+; VI-NEXT: v_writelane_b32 v30, s39, 5
; VI-NEXT: v_readfirstlane_b32 s40, v19
; VI-NEXT: v_mov_b32_e32 v19, s27
; VI-NEXT: v_readfirstlane_b32 s26, v17
@@ -4132,7 +4132,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s25, v18
; VI-NEXT: v_mov_b32_e32 v18, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: v_writelane_b32 v30, s38, 6
+; VI-NEXT: v_writelane_b32 v30, s30, 6
; VI-NEXT: v_readfirstlane_b32 s24, v19
; VI-NEXT: v_readfirstlane_b32 s23, v17
; VI-NEXT: v_readfirstlane_b32 s22, v18
@@ -4153,7 +4153,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s6, v14
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s7, v15
-; VI-NEXT: v_writelane_b32 v30, s39, 7
+; VI-NEXT: v_writelane_b32 v30, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB13_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s27, s7, 16
@@ -4339,6 +4339,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: s_or_b32 s8, s8, s29
; VI-NEXT: s_or_b32 s6, s6, s28
; VI-NEXT: s_or_b32 s7, s7, s27
+; VI-NEXT: v_readlane_b32 s30, v30, 6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s46
@@ -4369,14 +4370,13 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v27, s8
; VI-NEXT: v_mov_b32_e32 v28, s6
; VI-NEXT: v_mov_b32_e32 v29, s7
-; VI-NEXT: v_readlane_b32 s39, v30, 7
-; VI-NEXT: v_readlane_b32 s38, v30, 6
-; VI-NEXT: v_readlane_b32 s37, v30, 5
-; VI-NEXT: v_readlane_b32 s36, v30, 4
-; VI-NEXT: v_readlane_b32 s35, v30, 3
-; VI-NEXT: v_readlane_b32 s34, v30, 2
-; VI-NEXT: v_readlane_b32 s31, v30, 1
-; VI-NEXT: v_readlane_b32 s30, v30, 0
+; VI-NEXT: v_readlane_b32 s31, v30, 7
+; VI-NEXT: v_readlane_b32 s39, v30, 5
+; VI-NEXT: v_readlane_b32 s38, v30, 4
+; VI-NEXT: v_readlane_b32 s37, v30, 3
+; VI-NEXT: v_readlane_b32 s36, v30, 2
+; VI-NEXT: v_readlane_b32 s35, v30, 1
+; VI-NEXT: v_readlane_b32 s34, v30, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -4434,14 +4434,14 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v17, s22
; GFX9-NEXT: v_readfirstlane_b32 s10, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s23
-; GFX9-NEXT: v_writelane_b32 v30, s30, 0
+; GFX9-NEXT: v_writelane_b32 v30, s34, 0
; GFX9-NEXT: v_readfirstlane_b32 s11, v19
; GFX9-NEXT: v_mov_b32_e32 v19, s24
; GFX9-NEXT: v_readfirstlane_b32 s12, v17
; GFX9-NEXT: v_mov_b32_e32 v17, s25
; GFX9-NEXT: v_readfirstlane_b32 s13, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s26
-; GFX9-NEXT: v_writelane_b32 v30, s31, 1
+; GFX9-NEXT: v_writelane_b32 v30, s35, 1
; GFX9-NEXT: v_readfirstlane_b32 s14, v19
; GFX9-NEXT: v_mov_b32_e32 v19, s27
; GFX9-NEXT: v_readfirstlane_b32 s15, v17
@@ -4449,7 +4449,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s16, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_writelane_b32 v30, s34, 2
+; GFX9-NEXT: v_writelane_b32 v30, s30, 2
; GFX9-NEXT: v_readfirstlane_b32 s17, v19
; GFX9-NEXT: v_readfirstlane_b32 s18, v17
; GFX9-NEXT: v_readfirstlane_b32 s19, v18
@@ -4470,7 +4470,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s44, v14
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s45, v15
-; GFX9-NEXT: v_writelane_b32 v30, s35, 3
+; GFX9-NEXT: v_writelane_b32 v30, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB13_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s45, 16
@@ -4596,6 +4596,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46
+; GFX9-NEXT: v_readlane_b32 s30, v30, 2
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
@@ -4626,10 +4627,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v27, s41
; GFX9-NEXT: v_mov_b32_e32 v28, s42
; GFX9-NEXT: v_mov_b32_e32 v29, s43
-; GFX9-NEXT: v_readlane_b32 s35, v30, 3
-; GFX9-NEXT: v_readlane_b32 s34, v30, 2
-; GFX9-NEXT: v_readlane_b32 s31, v30, 1
-; GFX9-NEXT: v_readlane_b32 s30, v30, 0
+; GFX9-NEXT: v_readlane_b32 s31, v30, 3
+; GFX9-NEXT: v_readlane_b32 s35, v30, 1
+; GFX9-NEXT: v_readlane_b32 s34, v30, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -4905,7 +4905,7 @@ end:
ret <60 x i16> %phi
}
-define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
+define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v30i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6250,7 +6250,7 @@ end:
ret <30 x i32> %phi
}
-define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v30i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7369,7 +7369,7 @@ end:
ret <30 x i32> %phi
}
-define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
+define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8277,7 +8277,7 @@ end:
ret <60 x half> %phi
}
-define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30i32_to_v60f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8285,40 +8285,40 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v30, s30, 0
-; SI-NEXT: v_writelane_b32 v30, s31, 1
-; SI-NEXT: v_writelane_b32 v30, s34, 2
-; SI-NEXT: v_writelane_b32 v30, s35, 3
-; SI-NEXT: v_writelane_b32 v30, s36, 4
-; SI-NEXT: v_writelane_b32 v30, s37, 5
-; SI-NEXT: v_writelane_b32 v30, s38, 6
-; SI-NEXT: v_writelane_b32 v30, s39, 7
-; SI-NEXT: v_writelane_b32 v30, s48, 8
-; SI-NEXT: v_writelane_b32 v30, s49, 9
-; SI-NEXT: v_writelane_b32 v30, s50, 10
+; SI-NEXT: v_writelane_b32 v30, s34, 0
+; SI-NEXT: v_writelane_b32 v30, s35, 1
+; SI-NEXT: v_writelane_b32 v30, s36, 2
+; SI-NEXT: v_writelane_b32 v30, s37, 3
+; SI-NEXT: v_writelane_b32 v30, s38, 4
+; SI-NEXT: v_writelane_b32 v30, s39, 5
+; SI-NEXT: v_writelane_b32 v30, s48, 6
+; SI-NEXT: v_writelane_b32 v30, s49, 7
+; SI-NEXT: v_writelane_b32 v30, s50, 8
+; SI-NEXT: v_writelane_b32 v30, s51, 9
+; SI-NEXT: v_writelane_b32 v30, s52, 10
; SI-NEXT: v_mov_b32_e32 v17, s16
; SI-NEXT: v_mov_b32_e32 v18, s17
-; SI-NEXT: v_writelane_b32 v30, s51, 11
+; SI-NEXT: v_writelane_b32 v30, s53, 11
; SI-NEXT: v_mov_b32_e32 v19, s18
; SI-NEXT: v_readfirstlane_b32 s46, v17
; SI-NEXT: v_mov_b32_e32 v17, s19
; SI-NEXT: v_readfirstlane_b32 s47, v18
; SI-NEXT: v_mov_b32_e32 v18, s20
-; SI-NEXT: v_writelane_b32 v30, s52, 12
+; SI-NEXT: v_writelane_b32 v30, s54, 12
; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
; SI-NEXT: v_readfirstlane_b32 s45, v17
; SI-NEXT: v_mov_b32_e32 v17, s22
; SI-NEXT: v_readfirstlane_b32 s42, v18
; SI-NEXT: v_mov_b32_e32 v18, s23
-; SI-NEXT: v_writelane_b32 v30, s53, 13
+; SI-NEXT: v_writelane_b32 v30, s55, 13
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
; SI-NEXT: v_readfirstlane_b32 s40, v17
; SI-NEXT: v_mov_b32_e32 v17, s25
; SI-NEXT: v_readfirstlane_b32 s41, v18
; SI-NEXT: v_mov_b32_e32 v18, s26
-; SI-NEXT: v_writelane_b32 v30, s54, 14
+; SI-NEXT: v_writelane_b32 v30, s64, 14
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
; SI-NEXT: v_readfirstlane_b32 s25, v17
@@ -8326,7 +8326,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s22, v18
; SI-NEXT: v_mov_b32_e32 v18, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_writelane_b32 v30, s55, 15
+; SI-NEXT: v_writelane_b32 v30, s30, 15
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v17
; SI-NEXT: v_readfirstlane_b32 s21, v18
@@ -8347,7 +8347,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v15
-; SI-NEXT: v_writelane_b32 v30, s64, 16
+; SI-NEXT: v_writelane_b32 v30, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s34, s5, 16
@@ -8533,6 +8533,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s46
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v30, 15
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s44
@@ -8563,23 +8564,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v27, s7
; SI-NEXT: v_mov_b32_e32 v28, s4
; SI-NEXT: v_mov_b32_e32 v29, s5
-; SI-NEXT: v_readlane_b32 s64, v30, 16
-; SI-NEXT: v_readlane_b32 s55, v30, 15
-; SI-NEXT: v_readlane_b32 s54, v30, 14
-; SI-NEXT: v_readlane_b32 s53, v30, 13
-; SI-NEXT: v_readlane_b32 s52, v30, 12
-; SI-NEXT: v_readlane_b32 s51, v30, 11
-; SI-NEXT: v_readlane_b32 s50, v30, 10
-; SI-NEXT: v_readlane_b32 s49, v30, 9
-; SI-NEXT: v_readlane_b32 s48, v30, 8
-; SI-NEXT: v_readlane_b32 s39, v30, 7
-; SI-NEXT: v_readlane_b32 s38, v30, 6
-; SI-NEXT: v_readlane_b32 s37, v30, 5
-; SI-NEXT: v_readlane_b32 s36, v30, 4
-; SI-NEXT: v_readlane_b32 s35, v30, 3
-; SI-NEXT: v_readlane_b32 s34, v30, 2
-; SI-NEXT: v_readlane_b32 s31, v30, 1
-; SI-NEXT: v_readlane_b32 s30, v30, 0
+; SI-NEXT: v_readlane_b32 s31, v30, 16
+; SI-NEXT: v_readlane_b32 s64, v30, 14
+; SI-NEXT: v_readlane_b32 s55, v30, 13
+; SI-NEXT: v_readlane_b32 s54, v30, 12
+; SI-NEXT: v_readlane_b32 s53, v30, 11
+; SI-NEXT: v_readlane_b32 s52, v30, 10
+; SI-NEXT: v_readlane_b32 s51, v30, 9
+; SI-NEXT: v_readlane_b32 s50, v30, 8
+; SI-NEXT: v_readlane_b32 s49, v30, 7
+; SI-NEXT: v_readlane_b32 s48, v30, 6
+; SI-NEXT: v_readlane_b32 s39, v30, 5
+; SI-NEXT: v_readlane_b32 s38, v30, 4
+; SI-NEXT: v_readlane_b32 s37, v30, 3
+; SI-NEXT: v_readlane_b32 s36, v30, 2
+; SI-NEXT: v_readlane_b32 s35, v30, 1
+; SI-NEXT: v_readlane_b32 s34, v30, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -8624,31 +8624,31 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v30, s30, 0
-; VI-NEXT: v_writelane_b32 v30, s31, 1
+; VI-NEXT: v_writelane_b32 v30, s34, 0
+; VI-NEXT: v_writelane_b32 v30, s35, 1
; VI-NEXT: v_mov_b32_e32 v17, s16
; VI-NEXT: v_mov_b32_e32 v18, s17
-; VI-NEXT: v_writelane_b32 v30, s34, 2
+; VI-NEXT: v_writelane_b32 v30, s36, 2
; VI-NEXT: v_mov_b32_e32 v19, s18
; VI-NEXT: v_readfirstlane_b32 s56, v17
; VI-NEXT: v_mov_b32_e32 v17, s19
; VI-NEXT: v_readfirstlane_b32 s47, v18
; VI-NEXT: v_mov_b32_e32 v18, s20
-; VI-NEXT: v_writelane_b32 v30, s35, 3
+; VI-NEXT: v_writelane_b32 v30, s37, 3
; VI-NEXT: v_readfirstlane_b32 s46, v19
; VI-NEXT: v_mov_b32_e32 v19, s21
; VI-NEXT: v_readfirstlane_b32 s45, v17
; VI-NEXT: v_mov_b32_e32 v17, s22
; VI-NEXT: v_readfirstlane_b32 s44, v18
; VI-NEXT: v_mov_b32_e32 v18, s23
-; VI-NEXT: v_writelane_b32 v30, s36, 4
+; VI-NEXT: v_writelane_b32 v30, s38, 4
; VI-NEXT: v_readfirstlane_b32 s43, v19
; VI-NEXT: v_mov_b32_e32 v19, s24
; VI-NEXT: v_readfirstlane_b32 s42, v17
; VI-NEXT: v_mov_b32_e32 v17, s25
; VI-NEXT: v_readfirstlane_b32 s41, v18
; VI-NEXT: v_mov_b32_e32 v18, s26
-; VI-NEXT: v_writelane_b32 v30, s37, 5
+; VI-NEXT: v_writelane_b32 v30, s39, 5
; VI-NEXT: v_readfirstlane_b32 s40, v19
; VI-NEXT: v_mov_b32_e32 v19, s27
; VI-NEXT: v_readfirstlane_b32 s26, v17
@@ -8656,7 +8656,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s25, v18
; VI-NEXT: v_mov_b32_e32 v18, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: v_writelane_b32 v30, s38, 6
+; VI-NEXT: v_writelane_b32 v30, s30, 6
; VI-NEXT: v_readfirstlane_b32 s24, v19
; VI-NEXT: v_readfirstlane_b32 s23, v17
; VI-NEXT: v_readfirstlane_b32 s22, v18
@@ -8677,7 +8677,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s6, v14
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s7, v15
-; VI-NEXT: v_writelane_b32 v30, s39, 7
+; VI-NEXT: v_writelane_b32 v30, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB17_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s27, s7, 16
@@ -8863,6 +8863,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: s_or_b32 s8, s8, s29
; VI-NEXT: s_or_b32 s6, s6, s28
; VI-NEXT: s_or_b32 s7, s7, s27
+; VI-NEXT: v_readlane_b32 s30, v30, 6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s46
@@ -8893,14 +8894,13 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v27, s8
; VI-NEXT: v_mov_b32_e32 v28, s6
; VI-NEXT: v_mov_b32_e32 v29, s7
-; VI-NEXT: v_readlane_b32 s39, v30, 7
-; VI-NEXT: v_readlane_b32 s38, v30, 6
-; VI-NEXT: v_readlane_b32 s37, v30, 5
-; VI-NEXT: v_readlane_b32 s36, v30, 4
-; VI-NEXT: v_readlane_b32 s35, v30, 3
-; VI-NEXT: v_readlane_b32 s34, v30, 2
-; VI-NEXT: v_readlane_b32 s31, v30, 1
-; VI-NEXT: v_readlane_b32 s30, v30, 0
+; VI-NEXT: v_readlane_b32 s31, v30, 7
+; VI-NEXT: v_readlane_b32 s39, v30, 5
+; VI-NEXT: v_readlane_b32 s38, v30, 4
+; VI-NEXT: v_readlane_b32 s37, v30, 3
+; VI-NEXT: v_readlane_b32 s36, v30, 2
+; VI-NEXT: v_readlane_b32 s35, v30, 1
+; VI-NEXT: v_readlane_b32 s34, v30, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -8958,14 +8958,14 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v17, s22
; GFX9-NEXT: v_readfirstlane_b32 s10, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s23
-; GFX9-NEXT: v_writelane_b32 v30, s30, 0
+; GFX9-NEXT: v_writelane_b32 v30, s34, 0
; GFX9-NEXT: v_readfirstlane_b32 s11, v19
; GFX9-NEXT: v_mov_b32_e32 v19, s24
; GFX9-NEXT: v_readfirstlane_b32 s12, v17
; GFX9-NEXT: v_mov_b32_e32 v17, s25
; GFX9-NEXT: v_readfirstlane_b32 s13, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s26
-; GFX9-NEXT: v_writelane_b32 v30, s31, 1
+; GFX9-NEXT: v_writelane_b32 v30, s35, 1
; GFX9-NEXT: v_readfirstlane_b32 s14, v19
; GFX9-NEXT: v_mov_b32_e32 v19, s27
; GFX9-NEXT: v_readfirstlane_b32 s15, v17
@@ -8973,7 +8973,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s16, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_writelane_b32 v30, s34, 2
+; GFX9-NEXT: v_writelane_b32 v30, s30, 2
; GFX9-NEXT: v_readfirstlane_b32 s17, v19
; GFX9-NEXT: v_readfirstlane_b32 s18, v17
; GFX9-NEXT: v_readfirstlane_b32 s19, v18
@@ -8994,7 +8994,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s44, v14
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s45, v15
-; GFX9-NEXT: v_writelane_b32 v30, s35, 3
+; GFX9-NEXT: v_writelane_b32 v30, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB17_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s45, 16
@@ -9120,6 +9120,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46
+; GFX9-NEXT: v_readlane_b32 s30, v30, 2
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
@@ -9150,10 +9151,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v27, s41
; GFX9-NEXT: v_mov_b32_e32 v28, s42
; GFX9-NEXT: v_mov_b32_e32 v29, s43
-; GFX9-NEXT: v_readlane_b32 s35, v30, 3
-; GFX9-NEXT: v_readlane_b32 s34, v30, 2
-; GFX9-NEXT: v_readlane_b32 s31, v30, 1
-; GFX9-NEXT: v_readlane_b32 s30, v30, 0
+; GFX9-NEXT: v_readlane_b32 s31, v30, 3
+; GFX9-NEXT: v_readlane_b32 s35, v30, 1
+; GFX9-NEXT: v_readlane_b32 s34, v30, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -9429,7 +9429,7 @@ end:
ret <60 x half> %phi
}
-define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
+define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v30i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10918,7 +10918,7 @@ end:
ret <30 x i32> %phi
}
-define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v30i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12093,7 +12093,7 @@ end:
ret <30 x i32> %phi
}
-define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) {
+define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v15i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12268,7 +12268,7 @@ end:
ret <15 x i64> %phi
}
-define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v15i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12558,7 +12558,7 @@ end:
ret <15 x i64> %phi
}
-define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) {
+define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v30f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12756,7 +12756,7 @@ end:
ret <30 x float> %phi
}
-define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v30f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13069,7 +13069,7 @@ end:
ret <30 x float> %phi
}
-define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) {
+define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v15f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13244,7 +13244,7 @@ end:
ret <15 x double> %phi
}
-define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v15f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13534,7 +13534,7 @@ end:
ret <15 x double> %phi
}
-define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) {
+define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v30f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13664,7 +13664,7 @@ end:
ret <30 x float> %phi
}
-define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v30f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13912,7 +13912,7 @@ end:
ret <30 x float> %phi
}
-define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) {
+define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14790,7 +14790,7 @@ end:
ret <60 x i16> %phi
}
-define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v60i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15996,7 +15996,7 @@ end:
ret <60 x i16> %phi
}
-define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
+define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v30f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17341,7 +17341,7 @@ end:
ret <30 x float> %phi
}
-define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v30f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18460,7 +18460,7 @@ end:
ret <30 x float> %phi
}
-define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
+define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -19338,7 +19338,7 @@ end:
ret <60 x half> %phi
}
-define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v30f32_to_v60f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20544,7 +20544,7 @@ end:
ret <60 x half> %phi
}
-define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
+define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v30f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22033,7 +22033,7 @@ end:
ret <30 x float> %phi
}
-define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v30f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23208,7 +23208,7 @@ end:
ret <30 x float> %phi
}
-define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) {
+define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v15f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23406,7 +23406,7 @@ end:
ret <15 x double> %phi
}
-define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v15f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23719,7 +23719,7 @@ end:
ret <15 x double> %phi
}
-define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) {
+define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v15i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -23849,7 +23849,7 @@ end:
ret <15 x i64> %phi
}
-define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v15i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24097,7 +24097,7 @@ end:
ret <15 x i64> %phi
}
-define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) {
+define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25021,7 +25021,7 @@ end:
ret <60 x i16> %phi
}
-define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v60i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25029,40 +25029,40 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v30, s30, 0
-; SI-NEXT: v_writelane_b32 v30, s31, 1
-; SI-NEXT: v_writelane_b32 v30, s34, 2
-; SI-NEXT: v_writelane_b32 v30, s35, 3
-; SI-NEXT: v_writelane_b32 v30, s36, 4
-; SI-NEXT: v_writelane_b32 v30, s37, 5
-; SI-NEXT: v_writelane_b32 v30, s38, 6
-; SI-NEXT: v_writelane_b32 v30, s39, 7
-; SI-NEXT: v_writelane_b32 v30, s48, 8
-; SI-NEXT: v_writelane_b32 v30, s49, 9
-; SI-NEXT: v_writelane_b32 v30, s50, 10
+; SI-NEXT: v_writelane_b32 v30, s34, 0
+; SI-NEXT: v_writelane_b32 v30, s35, 1
+; SI-NEXT: v_writelane_b32 v30, s36, 2
+; SI-NEXT: v_writelane_b32 v30, s37, 3
+; SI-NEXT: v_writelane_b32 v30, s38, 4
+; SI-NEXT: v_writelane_b32 v30, s39, 5
+; SI-NEXT: v_writelane_b32 v30, s48, 6
+; SI-NEXT: v_writelane_b32 v30, s49, 7
+; SI-NEXT: v_writelane_b32 v30, s50, 8
+; SI-NEXT: v_writelane_b32 v30, s51, 9
+; SI-NEXT: v_writelane_b32 v30, s52, 10
; SI-NEXT: v_mov_b32_e32 v17, s16
; SI-NEXT: v_mov_b32_e32 v18, s17
-; SI-NEXT: v_writelane_b32 v30, s51, 11
+; SI-NEXT: v_writelane_b32 v30, s53, 11
; SI-NEXT: v_mov_b32_e32 v19, s18
; SI-NEXT: v_readfirstlane_b32 s46, v17
; SI-NEXT: v_mov_b32_e32 v17, s19
; SI-NEXT: v_readfirstlane_b32 s47, v18
; SI-NEXT: v_mov_b32_e32 v18, s20
-; SI-NEXT: v_writelane_b32 v30, s52, 12
+; SI-NEXT: v_writelane_b32 v30, s54, 12
; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
; SI-NEXT: v_readfirstlane_b32 s45, v17
; SI-NEXT: v_mov_b32_e32 v17, s22
; SI-NEXT: v_readfirstlane_b32 s42, v18
; SI-NEXT: v_mov_b32_e32 v18, s23
-; SI-NEXT: v_writelane_b32 v30, s53, 13
+; SI-NEXT: v_writelane_b32 v30, s55, 13
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
; SI-NEXT: v_readfirstlane_b32 s40, v17
; SI-NEXT: v_mov_b32_e32 v17, s25
; SI-NEXT: v_readfirstlane_b32 s41, v18
; SI-NEXT: v_mov_b32_e32 v18, s26
-; SI-NEXT: v_writelane_b32 v30, s54, 14
+; SI-NEXT: v_writelane_b32 v30, s64, 14
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
; SI-NEXT: v_readfirstlane_b32 s25, v17
@@ -25070,7 +25070,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s22, v18
; SI-NEXT: v_mov_b32_e32 v18, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_writelane_b32 v30, s55, 15
+; SI-NEXT: v_writelane_b32 v30, s30, 15
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v17
; SI-NEXT: v_readfirstlane_b32 s21, v18
@@ -25091,7 +25091,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v15
-; SI-NEXT: v_writelane_b32 v30, s64, 16
+; SI-NEXT: v_writelane_b32 v30, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s34, s5, 16
@@ -25277,6 +25277,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: s_or_b32 s9, s9, s46
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v30, 15
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s44
@@ -25307,23 +25308,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v27, s7
; SI-NEXT: v_mov_b32_e32 v28, s4
; SI-NEXT: v_mov_b32_e32 v29, s5
-; SI-NEXT: v_readlane_b32 s64, v30, 16
-; SI-NEXT: v_readlane_b32 s55, v30, 15
-; SI-NEXT: v_readlane_b32 s54, v30, 14
-; SI-NEXT: v_readlane_b32 s53, v30, 13
-; SI-NEXT: v_readlane_b32 s52, v30, 12
-; SI-NEXT: v_readlane_b32 s51, v30, 11
-; SI-NEXT: v_readlane_b32 s50, v30, 10
-; SI-NEXT: v_readlane_b32 s49, v30, 9
-; SI-NEXT: v_readlane_b32 s48, v30, 8
-; SI-NEXT: v_readlane_b32 s39, v30, 7
-; SI-NEXT: v_readlane_b32 s38, v30, 6
-; SI-NEXT: v_readlane_b32 s37, v30, 5
-; SI-NEXT: v_readlane_b32 s36, v30, 4
-; SI-NEXT: v_readlane_b32 s35, v30, 3
-; SI-NEXT: v_readlane_b32 s34, v30, 2
-; SI-NEXT: v_readlane_b32 s31, v30, 1
-; SI-NEXT: v_readlane_b32 s30, v30, 0
+; SI-NEXT: v_readlane_b32 s31, v30, 16
+; SI-NEXT: v_readlane_b32 s64, v30, 14
+; SI-NEXT: v_readlane_b32 s55, v30, 13
+; SI-NEXT: v_readlane_b32 s54, v30, 12
+; SI-NEXT: v_readlane_b32 s53, v30, 11
+; SI-NEXT: v_readlane_b32 s52, v30, 10
+; SI-NEXT: v_readlane_b32 s51, v30, 9
+; SI-NEXT: v_readlane_b32 s50, v30, 8
+; SI-NEXT: v_readlane_b32 s49, v30, 7
+; SI-NEXT: v_readlane_b32 s48, v30, 6
+; SI-NEXT: v_readlane_b32 s39, v30, 5
+; SI-NEXT: v_readlane_b32 s38, v30, 4
+; SI-NEXT: v_readlane_b32 s37, v30, 3
+; SI-NEXT: v_readlane_b32 s36, v30, 2
+; SI-NEXT: v_readlane_b32 s35, v30, 1
+; SI-NEXT: v_readlane_b32 s34, v30, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -25368,31 +25368,31 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v30, s30, 0
-; VI-NEXT: v_writelane_b32 v30, s31, 1
+; VI-NEXT: v_writelane_b32 v30, s34, 0
+; VI-NEXT: v_writelane_b32 v30, s35, 1
; VI-NEXT: v_mov_b32_e32 v17, s16
; VI-NEXT: v_mov_b32_e32 v18, s17
-; VI-NEXT: v_writelane_b32 v30, s34, 2
+; VI-NEXT: v_writelane_b32 v30, s36, 2
; VI-NEXT: v_mov_b32_e32 v19, s18
; VI-NEXT: v_readfirstlane_b32 s56, v17
; VI-NEXT: v_mov_b32_e32 v17, s19
; VI-NEXT: v_readfirstlane_b32 s47, v18
; VI-NEXT: v_mov_b32_e32 v18, s20
-; VI-NEXT: v_writelane_b32 v30, s35, 3
+; VI-NEXT: v_writelane_b32 v30, s37, 3
; VI-NEXT: v_readfirstlane_b32 s46, v19
; VI-NEXT: v_mov_b32_e32 v19, s21
; VI-NEXT: v_readfirstlane_b32 s45, v17
; VI-NEXT: v_mov_b32_e32 v17, s22
; VI-NEXT: v_readfirstlane_b32 s44, v18
; VI-NEXT: v_mov_b32_e32 v18, s23
-; VI-NEXT: v_writelane_b32 v30, s36, 4
+; VI-NEXT: v_writelane_b32 v30, s38, 4
; VI-NEXT: v_readfirstlane_b32 s43, v19
; VI-NEXT: v_mov_b32_e32 v19, s24
; VI-NEXT: v_readfirstlane_b32 s42, v17
; VI-NEXT: v_mov_b32_e32 v17, s25
; VI-NEXT: v_readfirstlane_b32 s41, v18
; VI-NEXT: v_mov_b32_e32 v18, s26
-; VI-NEXT: v_writelane_b32 v30, s37, 5
+; VI-NEXT: v_writelane_b32 v30, s39, 5
; VI-NEXT: v_readfirstlane_b32 s40, v19
; VI-NEXT: v_mov_b32_e32 v19, s27
; VI-NEXT: v_readfirstlane_b32 s26, v17
@@ -25400,7 +25400,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s25, v18
; VI-NEXT: v_mov_b32_e32 v18, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: v_writelane_b32 v30, s38, 6
+; VI-NEXT: v_writelane_b32 v30, s30, 6
; VI-NEXT: v_readfirstlane_b32 s24, v19
; VI-NEXT: v_readfirstlane_b32 s23, v17
; VI-NEXT: v_readfirstlane_b32 s22, v18
@@ -25421,7 +25421,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s6, v14
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s7, v15
-; VI-NEXT: v_writelane_b32 v30, s39, 7
+; VI-NEXT: v_writelane_b32 v30, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB41_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s27, s7, 16
@@ -25607,6 +25607,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: s_or_b32 s8, s8, s29
; VI-NEXT: s_or_b32 s6, s6, s28
; VI-NEXT: s_or_b32 s7, s7, s27
+; VI-NEXT: v_readlane_b32 s30, v30, 6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s46
@@ -25637,14 +25638,13 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v27, s8
; VI-NEXT: v_mov_b32_e32 v28, s6
; VI-NEXT: v_mov_b32_e32 v29, s7
-; VI-NEXT: v_readlane_b32 s39, v30, 7
-; VI-NEXT: v_readlane_b32 s38, v30, 6
-; VI-NEXT: v_readlane_b32 s37, v30, 5
-; VI-NEXT: v_readlane_b32 s36, v30, 4
-; VI-NEXT: v_readlane_b32 s35, v30, 3
-; VI-NEXT: v_readlane_b32 s34, v30, 2
-; VI-NEXT: v_readlane_b32 s31, v30, 1
-; VI-NEXT: v_readlane_b32 s30, v30, 0
+; VI-NEXT: v_readlane_b32 s31, v30, 7
+; VI-NEXT: v_readlane_b32 s39, v30, 5
+; VI-NEXT: v_readlane_b32 s38, v30, 4
+; VI-NEXT: v_readlane_b32 s37, v30, 3
+; VI-NEXT: v_readlane_b32 s36, v30, 2
+; VI-NEXT: v_readlane_b32 s35, v30, 1
+; VI-NEXT: v_readlane_b32 s34, v30, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -25702,14 +25702,14 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v17, s22
; GFX9-NEXT: v_readfirstlane_b32 s10, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s23
-; GFX9-NEXT: v_writelane_b32 v30, s30, 0
+; GFX9-NEXT: v_writelane_b32 v30, s34, 0
; GFX9-NEXT: v_readfirstlane_b32 s11, v19
; GFX9-NEXT: v_mov_b32_e32 v19, s24
; GFX9-NEXT: v_readfirstlane_b32 s12, v17
; GFX9-NEXT: v_mov_b32_e32 v17, s25
; GFX9-NEXT: v_readfirstlane_b32 s13, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s26
-; GFX9-NEXT: v_writelane_b32 v30, s31, 1
+; GFX9-NEXT: v_writelane_b32 v30, s35, 1
; GFX9-NEXT: v_readfirstlane_b32 s14, v19
; GFX9-NEXT: v_mov_b32_e32 v19, s27
; GFX9-NEXT: v_readfirstlane_b32 s15, v17
@@ -25717,7 +25717,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s16, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_writelane_b32 v30, s34, 2
+; GFX9-NEXT: v_writelane_b32 v30, s30, 2
; GFX9-NEXT: v_readfirstlane_b32 s17, v19
; GFX9-NEXT: v_readfirstlane_b32 s18, v17
; GFX9-NEXT: v_readfirstlane_b32 s19, v18
@@ -25738,7 +25738,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_readfirstlane_b32 s44, v14
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s45, v15
-; GFX9-NEXT: v_writelane_b32 v30, s35, 3
+; GFX9-NEXT: v_writelane_b32 v30, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB41_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s45, 16
@@ -25864,6 +25864,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46
+; GFX9-NEXT: v_readlane_b32 s30, v30, 2
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
@@ -25894,10 +25895,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v27, s41
; GFX9-NEXT: v_mov_b32_e32 v28, s42
; GFX9-NEXT: v_mov_b32_e32 v29, s43
-; GFX9-NEXT: v_readlane_b32 s35, v30, 3
-; GFX9-NEXT: v_readlane_b32 s34, v30, 2
-; GFX9-NEXT: v_readlane_b32 s31, v30, 1
-; GFX9-NEXT: v_readlane_b32 s30, v30, 0
+; GFX9-NEXT: v_readlane_b32 s31, v30, 3
+; GFX9-NEXT: v_readlane_b32 s35, v30, 1
+; GFX9-NEXT: v_readlane_b32 s34, v30, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -26173,7 +26173,7 @@ end:
ret <60 x i16> %phi
}
-define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
+define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v15i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27518,7 +27518,7 @@ end:
ret <15 x i64> %phi
}
-define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v15i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28637,7 +28637,7 @@ end:
ret <15 x i64> %phi
}
-define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
+define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29561,7 +29561,7 @@ end:
ret <60 x half> %phi
}
-define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15i64_to_v60f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29569,40 +29569,40 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v30, s30, 0
-; SI-NEXT: v_writelane_b32 v30, s31, 1
-; SI-NEXT: v_writelane_b32 v30, s34, 2
-; SI-NEXT: v_writelane_b32 v30, s35, 3
-; SI-NEXT: v_writelane_b32 v30, s36, 4
-; SI-NEXT: v_writelane_b32 v30, s37, 5
-; SI-NEXT: v_writelane_b32 v30, s38, 6
-; SI-NEXT: v_writelane_b32 v30, s39, 7
-; SI-NEXT: v_writelane_b32 v30, s48, 8
-; SI-NEXT: v_writelane_b32 v30, s49, 9
-; SI-NEXT: v_writelane_b32 v30, s50, 10
+; SI-NEXT: v_writelane_b32 v30, s34, 0
+; SI-NEXT: v_writelane_b32 v30, s35, 1
+; SI-NEXT: v_writelane_b32 v30, s36, 2
+; SI-NEXT: v_writelane_b32 v30, s37, 3
+; SI-NEXT: v_writelane_b32 v30, s38, 4
+; SI-NEXT: v_writelane_b32 v30, s39, 5
+; SI-NEXT: v_writelane_b32 v30, s48, 6
+; SI-NEXT: v_writelane_b32 v30, s49, 7
+; SI-NEXT: v_writelane_b32 v30, s50, 8
+; SI-NEXT: v_writelane_b32 v30, s51, 9
+; SI-NEXT: v_writelane_b32 v30, s52, 10
; SI-NEXT: v_mov_b32_e32 v17, s16
; SI-NEXT: v_mov_b32_e32 v18, s17
-; SI-NEXT: v_writelane_b32 v30, s51, 11
+; SI-NEXT: v_writelane_b32 v30, s53, 11
; SI-NEXT: v_mov_b32_e32 v19, s18
; SI-NEXT: v_readfirstlane_b32 s46, v17
; SI-NEXT: v_mov_b32_e32 v17, s19
; SI-NEXT: v_readfirstlane_b32 s47, v18
; SI-NEXT: v_mov_b32_e32 v18, s20
-; SI-NEXT: v_writelane_b32 v30, s52, 12
+; SI-NEXT: v_writelane_b32 v30, s54, 12
; SI-NEXT: v_readfirstlane_b32 s44, v19
; SI-NEXT: v_mov_b32_e32 v19, s21
; SI-NEXT: v_readfirstlane_b32 s45, v17
; SI-NEXT: v_mov_b32_e32 v17, s22
; SI-NEXT: v_readfirstlane_b32 s42, v18
; SI-NEXT: v_mov_b32_e32 v18, s23
-; SI-NEXT: v_writelane_b32 v30, s53, 13
+; SI-NEXT: v_writelane_b32 v30, s55, 13
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_mov_b32_e32 v19, s24
; SI-NEXT: v_readfirstlane_b32 s40, v17
; SI-NEXT: v_mov_b32_e32 v17, s25
; SI-NEXT: v_readfirstlane_b32 s41, v18
; SI-NEXT: v_mov_b32_e32 v18, s26
-; SI-NEXT: v_writelane_b32 v30, s54, 14
+; SI-NEXT: v_writelane_b32 v30, s64, 14
; SI-NEXT: v_readfirstlane_b32 s24, v19
; SI-NEXT: v_mov_b32_e32 v19, s27
; SI-NEXT: v_readfirstlane_b32 s25, v17
@@ -29610,7 +29610,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s22, v18
; SI-NEXT: v_mov_b32_e32 v18, s29
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_writelane_b32 v30, s55, 15
+; SI-NEXT: v_writelane_b32 v30, s30, 15
; SI-NEXT: v_readfirstlane_b32 s23, v19
; SI-NEXT: v_readfirstlane_b32 s20, v17
; SI-NEXT: v_readfirstlane_b32 s21, v18
@@ -29631,7 +29631,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s4, v14
; SI-NEXT: s_and_b64 s[26:27], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v15
-; SI-NEXT: v_writelane_b32 v30, s64, 16
+; SI-NEXT: v_writelane_b32 v30, s31, 16
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s34, s5, 16
@@ -29817,6 +29817,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: s_or_b32 s9, s9, s46
; SI-NEXT: s_or_b32 s7, s7, s28
; SI-NEXT: s_or_b32 s5, s5, s26
+; SI-NEXT: v_readlane_b32 s30, v30, 15
; SI-NEXT: v_mov_b32_e32 v0, s27
; SI-NEXT: v_mov_b32_e32 v1, s29
; SI-NEXT: v_mov_b32_e32 v2, s44
@@ -29847,23 +29848,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v27, s7
; SI-NEXT: v_mov_b32_e32 v28, s4
; SI-NEXT: v_mov_b32_e32 v29, s5
-; SI-NEXT: v_readlane_b32 s64, v30, 16
-; SI-NEXT: v_readlane_b32 s55, v30, 15
-; SI-NEXT: v_readlane_b32 s54, v30, 14
-; SI-NEXT: v_readlane_b32 s53, v30, 13
-; SI-NEXT: v_readlane_b32 s52, v30, 12
-; SI-NEXT: v_readlane_b32 s51, v30, 11
-; SI-NEXT: v_readlane_b32 s50, v30, 10
-; SI-NEXT: v_readlane_b32 s49, v30, 9
-; SI-NEXT: v_readlane_b32 s48, v30, 8
-; SI-NEXT: v_readlane_b32 s39, v30, 7
-; SI-NEXT: v_readlane_b32 s38, v30, 6
-; SI-NEXT: v_readlane_b32 s37, v30, 5
-; SI-NEXT: v_readlane_b32 s36, v30, 4
-; SI-NEXT: v_readlane_b32 s35, v30, 3
-; SI-NEXT: v_readlane_b32 s34, v30, 2
-; SI-NEXT: v_readlane_b32 s31, v30, 1
-; SI-NEXT: v_readlane_b32 s30, v30, 0
+; SI-NEXT: v_readlane_b32 s31, v30, 16
+; SI-NEXT: v_readlane_b32 s64, v30, 14
+; SI-NEXT: v_readlane_b32 s55, v30, 13
+; SI-NEXT: v_readlane_b32 s54, v30, 12
+; SI-NEXT: v_readlane_b32 s53, v30, 11
+; SI-NEXT: v_readlane_b32 s52, v30, 10
+; SI-NEXT: v_readlane_b32 s51, v30, 9
+; SI-NEXT: v_readlane_b32 s50, v30, 8
+; SI-NEXT: v_readlane_b32 s49, v30, 7
+; SI-NEXT: v_readlane_b32 s48, v30, 6
+; SI-NEXT: v_readlane_b32 s39, v30, 5
+; SI-NEXT: v_readlane_b32 s38, v30, 4
+; SI-NEXT: v_readlane_b32 s37, v30, 3
+; SI-NEXT: v_readlane_b32 s36, v30, 2
+; SI-NEXT: v_readlane_b32 s35, v30, 1
+; SI-NEXT: v_readlane_b32 s34, v30, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -29908,31 +29908,31 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v30, s30, 0
-; VI-NEXT: v_writelane_b32 v30, s31, 1
+; VI-NEXT: v_writelane_b32 v30, s34, 0
+; VI-NEXT: v_writelane_b32 v30, s35, 1
; VI-NEXT: v_mov_b32_e32 v17, s16
; VI-NEXT: v_mov_b32_e32 v18, s17
-; VI-NEXT: v_writelane_b32 v30, s34, 2
+; VI-NEXT: v_writelane_b32 v30, s36, 2
; VI-NEXT: v_mov_b32_e32 v19, s18
; VI-NEXT: v_readfirstlane_b32 s56, v17
; VI-NEXT: v_mov_b32_e32 v17, s19
; VI-NEXT: v_readfirstlane_b32 s47, v18
; VI-NEXT: v_mov_b32_e32 v18, s20
-; VI-NEXT: v_writelane_b32 v30, s35, 3
+; VI-NEXT: v_writelane_b32 v30, s37, 3
; VI-NEXT: v_readfirstlane_b32 s46, v19
; VI-NEXT: v_mov_b32_e32 v19, s21
; VI-NEXT: v_readfirstlane_b32 s45, v17
; VI-NEXT: v_mov_b32_e32 v17, s22
; VI-NEXT: v_readfirstlane_b32 s44, v18
; VI-NEXT: v_mov_b32_e32 v18, s23
-; VI-NEXT: v_writelane_b32 v30, s36, 4
+; VI-NEXT: v_writelane_b32 v30, s38, 4
; VI-NEXT: v_readfirstlane_b32 s43, v19
; VI-NEXT: v_mov_b32_e32 v19, s24
; VI-NEXT: v_readfirstlane_b32 s42, v17
; VI-NEXT: v_mov_b32_e32 v17, s25
; VI-NEXT: v_readfirstlane_b32 s41, v18
; VI-NEXT: v_mov_b32_e32 v18, s26
-; VI-NEXT: v_writelane_b32 v30, s37, 5
+; VI-NEXT: v_writelane_b32 v30, s39, 5
; VI-NEXT: v_readfirstlane_b32 s40, v19
; VI-NEXT: v_mov_b32_e32 v19, s27
; VI-NEXT: v_readfirstlane_b32 s26, v17
@@ -29940,7 +29940,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s25, v18
; VI-NEXT: v_mov_b32_e32 v18, s29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: v_writelane_b32 v30, s38, 6
+; VI-NEXT: v_writelane_b32 v30, s30, 6
; VI-NEXT: v_readfirstlane_b32 s24, v19
; VI-NEXT: v_readfirstlane_b32 s23, v17
; VI-NEXT: v_readfirstlane_b32 s22, v18
@@ -29961,7 +29961,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s6, v14
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s7, v15
-; VI-NEXT: v_writelane_b32 v30, s39, 7
+; VI-NEXT: v_writelane_b32 v30, s31, 7
; VI-NEXT: s_cbranch_scc0 .LBB45_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s27, s7, 16
@@ -30147,6 +30147,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: s_or_b32 s8, s8, s29
; VI-NEXT: s_or_b32 s6, s6, s28
; VI-NEXT: s_or_b32 s7, s7, s27
+; VI-NEXT: v_readlane_b32 s30, v30, 6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s46
@@ -30177,14 +30178,13 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v27, s8
; VI-NEXT: v_mov_b32_e32 v28, s6
; VI-NEXT: v_mov_b32_e32 v29, s7
-; VI-NEXT: v_readlane_b32 s39, v30, 7
-; VI-NEXT: v_readlane_b32 s38, v30, 6
-; VI-NEXT: v_readlane_b32 s37, v30, 5
-; VI-NEXT: v_readlane_b32 s36, v30, 4
-; VI-NEXT: v_readlane_b32 s35, v30, 3
-; VI-NEXT: v_readlane_b32 s34, v30, 2
-; VI-NEXT: v_readlane_b32 s31, v30, 1
-; VI-NEXT: v_readlane_b32 s30, v30, 0
+; VI-NEXT: v_readlane_b32 s31, v30, 7
+; VI-NEXT: v_readlane_b32 s39, v30, 5
+; VI-NEXT: v_readlane_b32 s38, v30, 4
+; VI-NEXT: v_readlane_b32 s37, v30, 3
+; VI-NEXT: v_readlane_b32 s36, v30, 2
+; VI-NEXT: v_readlane_b32 s35, v30, 1
+; VI-NEXT: v_readlane_b32 s34, v30, 0
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
@@ -30242,14 +30242,14 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v17, s22
; GFX9-NEXT: v_readfirstlane_b32 s10, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s23
-; GFX9-NEXT: v_writelane_b32 v30, s30, 0
+; GFX9-NEXT: v_writelane_b32 v30, s34, 0
; GFX9-NEXT: v_readfirstlane_b32 s11, v19
; GFX9-NEXT: v_mov_b32_e32 v19, s24
; GFX9-NEXT: v_readfirstlane_b32 s12, v17
; GFX9-NEXT: v_mov_b32_e32 v17, s25
; GFX9-NEXT: v_readfirstlane_b32 s13, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s26
-; GFX9-NEXT: v_writelane_b32 v30, s31, 1
+; GFX9-NEXT: v_writelane_b32 v30, s35, 1
; GFX9-NEXT: v_readfirstlane_b32 s14, v19
; GFX9-NEXT: v_mov_b32_e32 v19, s27
; GFX9-NEXT: v_readfirstlane_b32 s15, v17
@@ -30257,7 +30257,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s16, v18
; GFX9-NEXT: v_mov_b32_e32 v18, s29
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_writelane_b32 v30, s34, 2
+; GFX9-NEXT: v_writelane_b32 v30, s30, 2
; GFX9-NEXT: v_readfirstlane_b32 s17, v19
; GFX9-NEXT: v_readfirstlane_b32 s18, v17
; GFX9-NEXT: v_readfirstlane_b32 s19, v18
@@ -30278,7 +30278,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: v_readfirstlane_b32 s44, v14
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s45, v15
-; GFX9-NEXT: v_writelane_b32 v30, s35, 3
+; GFX9-NEXT: v_writelane_b32 v30, s31, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB45_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_lshr_b32 s46, s45, 16
@@ -30404,6 +30404,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56
; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47
; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46
+; GFX9-NEXT: v_readlane_b32 s30, v30, 2
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
@@ -30434,10 +30435,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v27, s41
; GFX9-NEXT: v_mov_b32_e32 v28, s42
; GFX9-NEXT: v_mov_b32_e32 v29, s43
-; GFX9-NEXT: v_readlane_b32 s35, v30, 3
-; GFX9-NEXT: v_readlane_b32 s34, v30, 2
-; GFX9-NEXT: v_readlane_b32 s31, v30, 1
-; GFX9-NEXT: v_readlane_b32 s30, v30, 0
+; GFX9-NEXT: v_readlane_b32 s31, v30, 3
+; GFX9-NEXT: v_readlane_b32 s35, v30, 1
+; GFX9-NEXT: v_readlane_b32 s34, v30, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -30713,7 +30713,7 @@ end:
ret <60 x half> %phi
}
-define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
+define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v15i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32202,7 +32202,7 @@ end:
ret <15 x i64> %phi
}
-define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v15i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -33377,7 +33377,7 @@ end:
ret <15 x i64> %phi
}
-define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) {
+define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34210,7 +34210,7 @@ end:
ret <60 x i16> %phi
}
-define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v60i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -35371,7 +35371,7 @@ end:
ret <60 x i16> %phi
}
-define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
+define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v15f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36716,7 +36716,7 @@ end:
ret <15 x double> %phi
}
-define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v15f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -37835,7 +37835,7 @@ end:
ret <15 x double> %phi
}
-define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) {
+define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38668,7 +38668,7 @@ end:
ret <60 x half> %phi
}
-define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v15f64_to_v60f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -39829,7 +39829,7 @@ end:
ret <60 x half> %phi
}
-define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
+define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v15f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41318,7 +41318,7 @@ end:
ret <15 x double> %phi
}
-define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v15f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42493,7 +42493,7 @@ end:
ret <15 x double> %phi
}
-define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) {
+define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v60f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43966,7 +43966,7 @@ end:
ret <60 x half> %phi
}
-define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i32 inreg %b) {
+define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60i16_to_v60f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43975,61 +43975,61 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v30, s30, 0
-; SI-NEXT: v_writelane_b32 v30, s31, 1
-; SI-NEXT: v_writelane_b32 v30, s34, 2
-; SI-NEXT: v_writelane_b32 v30, s35, 3
-; SI-NEXT: v_writelane_b32 v30, s36, 4
-; SI-NEXT: v_writelane_b32 v30, s37, 5
-; SI-NEXT: v_writelane_b32 v30, s38, 6
-; SI-NEXT: v_writelane_b32 v30, s39, 7
-; SI-NEXT: v_writelane_b32 v30, s48, 8
-; SI-NEXT: v_writelane_b32 v30, s49, 9
-; SI-NEXT: v_writelane_b32 v30, s50, 10
-; SI-NEXT: v_writelane_b32 v30, s51, 11
-; SI-NEXT: v_writelane_b32 v30, s52, 12
-; SI-NEXT: v_writelane_b32 v30, s53, 13
-; SI-NEXT: v_writelane_b32 v30, s54, 14
-; SI-NEXT: v_writelane_b32 v30, s55, 15
-; SI-NEXT: v_writelane_b32 v30, s64, 16
-; SI-NEXT: v_writelane_b32 v30, s65, 17
-; SI-NEXT: v_writelane_b32 v30, s66, 18
-; SI-NEXT: v_writelane_b32 v30, s67, 19
-; SI-NEXT: v_writelane_b32 v30, s68, 20
-; SI-NEXT: v_writelane_b32 v30, s69, 21
+; SI-NEXT: v_writelane_b32 v30, s34, 0
+; SI-NEXT: v_writelane_b32 v30, s35, 1
+; SI-NEXT: v_writelane_b32 v30, s36, 2
+; SI-NEXT: v_writelane_b32 v30, s37, 3
+; SI-NEXT: v_writelane_b32 v30, s38, 4
+; SI-NEXT: v_writelane_b32 v30, s39, 5
+; SI-NEXT: v_writelane_b32 v30, s48, 6
+; SI-NEXT: v_writelane_b32 v30, s49, 7
+; SI-NEXT: v_writelane_b32 v30, s50, 8
+; SI-NEXT: v_writelane_b32 v30, s51, 9
+; SI-NEXT: v_writelane_b32 v30, s52, 10
+; SI-NEXT: v_writelane_b32 v30, s53, 11
+; SI-NEXT: v_writelane_b32 v30, s54, 12
+; SI-NEXT: v_writelane_b32 v30, s55, 13
+; SI-NEXT: v_writelane_b32 v30, s64, 14
+; SI-NEXT: v_writelane_b32 v30, s65, 15
+; SI-NEXT: v_writelane_b32 v30, s66, 16
+; SI-NEXT: v_writelane_b32 v30, s67, 17
+; SI-NEXT: v_writelane_b32 v30, s68, 18
+; SI-NEXT: v_writelane_b32 v30, s69, 19
+; SI-NEXT: v_writelane_b32 v30, s70, 20
+; SI-NEXT: v_writelane_b32 v30, s71, 21
; SI-NEXT: v_readfirstlane_b32 s4, v5
; SI-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane
-; SI-NEXT: v_writelane_b32 v30, s70, 22
+; SI-NEXT: v_writelane_b32 v30, s80, 22
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v31, s4, 0
; SI-NEXT: v_readfirstlane_b32 s4, v4
-; SI-NEXT: v_writelane_b32 v30, s71, 23
+; SI-NEXT: v_writelane_b32 v30, s81, 23
; SI-NEXT: v_writelane_b32 v31, s4, 1
; SI-NEXT: v_readfirstlane_b32 s4, v3
-; SI-NEXT: v_writelane_b32 v30, s80, 24
+; SI-NEXT: v_writelane_b32 v30, s82, 24
; SI-NEXT: v_writelane_b32 v31, s4, 2
-; SI-NEXT: v_writelane_b32 v30, s81, 25
+; SI-NEXT: v_writelane_b32 v30, s83, 25
; SI-NEXT: v_writelane_b32 v31, s29, 3
; SI-NEXT: s_lshr_b32 s4, s28, 16
-; SI-NEXT: v_writelane_b32 v30, s82, 26
+; SI-NEXT: v_writelane_b32 v30, s84, 26
; SI-NEXT: v_writelane_b32 v31, s4, 4
-; SI-NEXT: v_writelane_b32 v30, s83, 27
+; SI-NEXT: v_writelane_b32 v30, s85, 27
; SI-NEXT: v_writelane_b32 v31, s27, 5
-; SI-NEXT: v_writelane_b32 v30, s84, 28
+; SI-NEXT: v_writelane_b32 v30, s86, 28
; SI-NEXT: v_writelane_b32 v31, s25, 6
-; SI-NEXT: v_writelane_b32 v30, s85, 29
+; SI-NEXT: v_writelane_b32 v30, s87, 29
; SI-NEXT: v_writelane_b32 v31, s23, 7
-; SI-NEXT: v_writelane_b32 v30, s86, 30
+; SI-NEXT: v_writelane_b32 v30, s96, 30
; SI-NEXT: v_writelane_b32 v31, s21, 8
; SI-NEXT: s_lshr_b32 s4, s20, 16
-; SI-NEXT: v_writelane_b32 v30, s87, 31
+; SI-NEXT: v_writelane_b32 v30, s97, 31
; SI-NEXT: v_writelane_b32 v31, s4, 9
-; SI-NEXT: v_writelane_b32 v30, s96, 32
+; SI-NEXT: v_writelane_b32 v30, s98, 32
; SI-NEXT: v_writelane_b32 v31, s16, 10
-; SI-NEXT: v_writelane_b32 v30, s97, 33
+; SI-NEXT: v_writelane_b32 v30, s99, 33
; SI-NEXT: s_mov_b32 s59, s20
; SI-NEXT: v_writelane_b32 v31, s18, 11
-; SI-NEXT: v_writelane_b32 v30, s98, 34
+; SI-NEXT: v_writelane_b32 v30, s30, 34
; SI-NEXT: s_mov_b32 s98, s22
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14
@@ -44071,7 +44071,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: s_lshr_b32 s94, s16, 16
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; SI-NEXT: v_writelane_b32 v31, s59, 12
-; SI-NEXT: v_writelane_b32 v30, s99, 35
+; SI-NEXT: v_writelane_b32 v30, s31, 35
; SI-NEXT: v_readfirstlane_b32 s29, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s37, v1
@@ -44569,6 +44569,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_lshl_b32 s44, s86, 16
; SI-NEXT: s_or_b32 s5, s5, s44
+; SI-NEXT: v_readlane_b32 s30, v30, 34
; SI-NEXT: v_readlane_b32 s45, v31, 20
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
@@ -44600,42 +44601,41 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v27, s7
; SI-NEXT: v_mov_b32_e32 v28, s4
; SI-NEXT: v_mov_b32_e32 v29, s5
-; SI-NEXT: v_readlane_b32 s99, v30, 35
-; SI-NEXT: v_readlane_b32 s98, v30, 34
-; SI-NEXT: v_readlane_b32 s97, v30, 33
-; SI-NEXT: v_readlane_b32 s96, v30, 32
-; SI-NEXT: v_readlane_b32 s87, v30, 31
-; SI-NEXT: v_readlane_b32 s86, v30, 30
-; SI-NEXT: v_readlane_b32 s85, v30, 29
-; SI-NEXT: v_readlane_b32 s84, v30, 28
-; SI-NEXT: v_readlane_b32 s83, v30, 27
-; SI-NEXT: v_readlane_b32 s82, v30, 26
-; SI-NEXT: v_readlane_b32 s81, v30, 25
-; SI-NEXT: v_readlane_b32 s80, v30, 24
-; SI-NEXT: v_readlane_b32 s71, v30, 23
-; SI-NEXT: v_readlane_b32 s70, v30, 22
-; SI-NEXT: v_readlane_b32 s69, v30, 21
-; SI-NEXT: v_readlane_b32 s68, v30, 20
-; SI-NEXT: v_readlane_b32 s67, v30, 19
-; SI-NEXT: v_readlane_b32 s66, v30, 18
-; SI-NEXT: v_readlane_b32 s65, v30, 17
-; SI-NEXT: v_readlane_b32 s64, v30, 16
-; SI-NEXT: v_readlane_b32 s55, v30, 15
-; SI-NEXT: v_readlane_b32 s54, v30, 14
-; SI-NEXT: v_readlane_b32 s53, v30, 13
-; SI-NEXT: v_readlane_b32 s52, v30, 12
-; SI-NEXT: v_readlane_b32 s51, v30, 11
-; SI-NEXT: v_readlane_b32 s50, v30, 10
-; SI-NEXT: v_readlane_b32 s49, v30, 9
-; SI-NEXT: v_readlane_b32 s48, v30, 8
-; SI-NEXT: v_readlane_b32 s39, v30, 7
-; SI-NEXT: v_readlane_b32 s38, v30, 6
-; SI-NEXT: v_readlane_b32 s37, v30, 5
-; SI-NEXT: v_readlane_b32 s36, v30, 4
-; SI-NEXT: v_readlane_b32 s35, v30, 3
-; SI-NEXT: v_readlane_b32 s34, v30, 2
-; SI-NEXT: v_readlane_b32 s31, v30, 1
-; SI-NEXT: v_readlane_b32 s30, v30, 0
+; SI-NEXT: v_readlane_b32 s31, v30, 35
+; SI-NEXT: v_readlane_b32 s99, v30, 33
+; SI-NEXT: v_readlane_b32 s98, v30, 32
+; SI-NEXT: v_readlane_b32 s97, v30, 31
+; SI-NEXT: v_readlane_b32 s96, v30, 30
+; SI-NEXT: v_readlane_b32 s87, v30, 29
+; SI-NEXT: v_readlane_b32 s86, v30, 28
+; SI-NEXT: v_readlane_b32 s85, v30, 27
+; SI-NEXT: v_readlane_b32 s84, v30, 26
+; SI-NEXT: v_readlane_b32 s83, v30, 25
+; SI-NEXT: v_readlane_b32 s82, v30, 24
+; SI-NEXT: v_readlane_b32 s81, v30, 23
+; SI-NEXT: v_readlane_b32 s80, v30, 22
+; SI-NEXT: v_readlane_b32 s71, v30, 21
+; SI-NEXT: v_readlane_b32 s70, v30, 20
+; SI-NEXT: v_readlane_b32 s69, v30, 19
+; SI-NEXT: v_readlane_b32 s68, v30, 18
+; SI-NEXT: v_readlane_b32 s67, v30, 17
+; SI-NEXT: v_readlane_b32 s66, v30, 16
+; SI-NEXT: v_readlane_b32 s65, v30, 15
+; SI-NEXT: v_readlane_b32 s64, v30, 14
+; SI-NEXT: v_readlane_b32 s55, v30, 13
+; SI-NEXT: v_readlane_b32 s54, v30, 12
+; SI-NEXT: v_readlane_b32 s53, v30, 11
+; SI-NEXT: v_readlane_b32 s52, v30, 10
+; SI-NEXT: v_readlane_b32 s51, v30, 9
+; SI-NEXT: v_readlane_b32 s50, v30, 8
+; SI-NEXT: v_readlane_b32 s49, v30, 7
+; SI-NEXT: v_readlane_b32 s48, v30, 6
+; SI-NEXT: v_readlane_b32 s39, v30, 5
+; SI-NEXT: v_readlane_b32 s38, v30, 4
+; SI-NEXT: v_readlane_b32 s37, v30, 3
+; SI-NEXT: v_readlane_b32 s36, v30, 2
+; SI-NEXT: v_readlane_b32 s35, v30, 1
+; SI-NEXT: v_readlane_b32 s34, v30, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -45630,7 +45630,7 @@ end:
ret <60 x half> %phi
}
-define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
+define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v60i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46657,7 +46657,7 @@ end:
ret <60 x i16> %phi
}
-define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i32 inreg %b) {
+define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v60f16_to_v60i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -48241,3 +48241,5 @@ end:
%phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <60 x i16> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 687bea385a266..18eeff9df6180 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) {
+define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -84,7 +84,7 @@ end:
ret <3 x float> %phi
}
-define inreg <3 x float> @bitcast_v3i32_to_v3f32_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v3i32_to_v3f32_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v3f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -179,7 +179,7 @@ end:
ret <3 x float> %phi
}
-define <3 x i32> @bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) {
+define <3 x i32> @bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v3i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -256,7 +256,7 @@ end:
ret <3 x i32> %phi
}
-define inreg <3 x i32> @bitcast_v3f32_to_v3i32_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v3f32_to_v3i32_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v3i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -354,7 +354,7 @@ end:
ret <3 x i32> %phi
}
-define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) {
+define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v12i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -625,7 +625,7 @@ end:
ret <12 x i8> %phi
}
-define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v12i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -858,7 +858,7 @@ end:
ret <12 x i8> %phi
}
-define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
+define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v3i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1284,7 +1284,7 @@ end:
ret <3 x i32> %phi
}
-define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v3i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1571,7 +1571,7 @@ end:
ret <3 x i32> %phi
}
-define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v6bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1684,7 +1684,7 @@ end:
ret <6 x bfloat> %phi
}
-define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v6bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1806,7 +1806,7 @@ end:
ret <6 x bfloat> %phi
}
-define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
+define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v3i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2153,7 +2153,7 @@ end:
ret <3 x i32> %phi
}
-define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v3i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2535,7 +2535,7 @@ end:
ret <3 x i32> %phi
}
-define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) {
+define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v6f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2633,7 +2633,7 @@ end:
ret <6 x half> %phi
}
-define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v6f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2746,7 +2746,7 @@ end:
ret <6 x half> %phi
}
-define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) {
+define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v3i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2884,7 +2884,7 @@ end:
ret <3 x i32> %phi
}
-define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v3i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3030,7 +3030,7 @@ end:
ret <3 x i32> %phi
}
-define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) {
+define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v6i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3128,7 +3128,7 @@ end:
ret <6 x i16> %phi
}
-define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3i32_to_v6i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3241,7 +3241,7 @@ end:
ret <6 x i16> %phi
}
-define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) {
+define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v3i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3366,7 +3366,7 @@ end:
ret <3 x i32> %phi
}
-define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v3i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3499,7 +3499,7 @@ end:
ret <3 x i32> %phi
}
-define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) {
+define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v12i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3768,7 +3768,7 @@ end:
ret <12 x i8> %phi
}
-define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v12i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4021,7 +4021,7 @@ end:
ret <12 x i8> %phi
}
-define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
+define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4447,7 +4447,7 @@ end:
ret <3 x float> %phi
}
-define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v3f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4734,7 +4734,7 @@ end:
ret <3 x float> %phi
}
-define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v6bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4846,7 +4846,7 @@ end:
ret <6 x bfloat> %phi
}
-define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v6bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4980,7 +4980,7 @@ end:
ret <6 x bfloat> %phi
}
-define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
+define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5327,7 +5327,7 @@ end:
ret <3 x float> %phi
}
-define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v3f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5709,7 +5709,7 @@ end:
ret <3 x float> %phi
}
-define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) {
+define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v6f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5806,7 +5806,7 @@ end:
ret <6 x half> %phi
}
-define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v6f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5928,7 +5928,7 @@ end:
ret <6 x half> %phi
}
-define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) {
+define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6066,7 +6066,7 @@ end:
ret <3 x float> %phi
}
-define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v3f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6212,7 +6212,7 @@ end:
ret <3 x float> %phi
}
-define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) {
+define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v6i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6309,7 +6309,7 @@ end:
ret <6 x i16> %phi
}
-define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v3f32_to_v6i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6431,7 +6431,7 @@ end:
ret <6 x i16> %phi
}
-define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) {
+define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6556,7 +6556,7 @@ end:
ret <3 x float> %phi
}
-define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v3f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -6689,7 +6689,7 @@ end:
ret <3 x float> %phi
}
-define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v6bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7134,7 +7134,7 @@ end:
ret <6 x bfloat> %phi
}
-define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v6bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7438,7 +7438,7 @@ end:
ret <6 x bfloat> %phi
}
-define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
+define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v12i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -7958,7 +7958,7 @@ end:
ret <12 x i8> %phi
}
-define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v12i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8519,7 +8519,7 @@ end:
ret <12 x i8> %phi
}
-define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
+define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v6f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8962,7 +8962,7 @@ end:
ret <6 x half> %phi
}
-define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v6f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9269,7 +9269,7 @@ end:
ret <6 x half> %phi
}
-define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) {
+define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v12i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9586,7 +9586,7 @@ end:
ret <12 x i8> %phi
}
-define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v12i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -9882,7 +9882,7 @@ end:
ret <12 x i8> %phi
}
-define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
+define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v6i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10325,7 +10325,7 @@ end:
ret <6 x i16> %phi
}
-define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v12i8_to_v6i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10632,7 +10632,7 @@ end:
ret <6 x i16> %phi
}
-define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) {
+define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v12i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -10942,7 +10942,7 @@ end:
ret <12 x i8> %phi
}
-define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v12i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11219,7 +11219,7 @@ end:
ret <12 x i8> %phi
}
-define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) {
+define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v6f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11580,7 +11580,7 @@ end:
ret <6 x half> %phi
}
-define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v6f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -11978,7 +11978,7 @@ end:
ret <6 x half> %phi
}
-define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v6bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12122,7 +12122,7 @@ end:
ret <6 x bfloat> %phi
}
-define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v6bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12288,7 +12288,7 @@ end:
ret <6 x bfloat> %phi
}
-define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
+define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v6i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12641,7 +12641,7 @@ end:
ret <6 x i16> %phi
}
-define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6bf16_to_v6i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13018,7 +13018,7 @@ end:
ret <6 x i16> %phi
}
-define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) {
+define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v6bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13149,7 +13149,7 @@ end:
ret <6 x bfloat> %phi
}
-define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v6bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13300,7 +13300,7 @@ end:
ret <6 x bfloat> %phi
}
-define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) {
+define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v6i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13422,7 +13422,7 @@ end:
ret <6 x i16> %phi
}
-define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 inreg %b) {
+define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6f16_to_v6i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13574,7 +13574,7 @@ end:
ret <6 x i16> %phi
}
-define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) {
+define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v6f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13707,7 +13707,7 @@ end:
ret <6 x half> %phi
}
-define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 inreg %b) {
+define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 inreg %b) #0 {
; SI-LABEL: bitcast_v6i16_to_v6f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -13856,3 +13856,5 @@ end:
%phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <6 x half> %phi
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
index 2b48cf0f41c88..7e9f825e298c7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll
@@ -5,7 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bitcast_i8ptr_v16i8ptr:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -66,3 +66,5 @@ entry:
store <16 x i8> %0, ptr addrspace(1) %out
ret void
}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index afe0971088bc1..ff31915e9080f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -119,32 +119,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_mov_b32 s50, s15
@@ -178,21 +178,21 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s30, v43, 12
; CHECK-NEXT: v_or_b32_e32 v1, v1, v2
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s31, v43, 13
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -259,30 +259,30 @@ define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v42, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v3
; CHECK-NEXT: v_mov_b32_e32 v40, v2
@@ -314,20 +314,20 @@ define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s30, v43, 12
+; CHECK-NEXT: v_readlane_b32 s31, v43, 13
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -401,32 +401,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_mov_b32 s50, s15
@@ -460,21 +460,21 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s30, v43, 12
; CHECK-NEXT: v_or_b32_e32 v1, v1, v2
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s31, v43, 13
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -543,30 +543,30 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v42, s16, 14
-; CHECK-NEXT: v_writelane_b32 v42, s30, 0
-; CHECK-NEXT: v_writelane_b32 v42, s31, 1
-; CHECK-NEXT: v_writelane_b32 v42, s34, 2
-; CHECK-NEXT: v_writelane_b32 v42, s35, 3
-; CHECK-NEXT: v_writelane_b32 v42, s36, 4
-; CHECK-NEXT: v_writelane_b32 v42, s37, 5
-; CHECK-NEXT: v_writelane_b32 v42, s38, 6
-; CHECK-NEXT: v_writelane_b32 v42, s39, 7
+; CHECK-NEXT: v_writelane_b32 v42, s34, 0
+; CHECK-NEXT: v_writelane_b32 v42, s35, 1
+; CHECK-NEXT: v_writelane_b32 v42, s36, 2
+; CHECK-NEXT: v_writelane_b32 v42, s37, 3
+; CHECK-NEXT: v_writelane_b32 v42, s38, 4
+; CHECK-NEXT: v_writelane_b32 v42, s39, 5
; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v42, s48, 8
-; CHECK-NEXT: v_writelane_b32 v42, s49, 9
+; CHECK-NEXT: v_writelane_b32 v42, s48, 6
+; CHECK-NEXT: v_writelane_b32 v42, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v42, s50, 8
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v42, s50, 10
-; CHECK-NEXT: v_writelane_b32 v42, s51, 11
-; CHECK-NEXT: v_writelane_b32 v42, s52, 12
+; CHECK-NEXT: v_writelane_b32 v42, s51, 9
+; CHECK-NEXT: v_writelane_b32 v42, s52, 10
+; CHECK-NEXT: v_writelane_b32 v42, s53, 11
+; CHECK-NEXT: v_writelane_b32 v42, s30, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v42, s53, 13
+; CHECK-NEXT: v_writelane_b32 v42, s31, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -597,20 +597,20 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: v_readlane_b32 s53, v42, 13
-; CHECK-NEXT: v_readlane_b32 s52, v42, 12
-; CHECK-NEXT: v_readlane_b32 s51, v42, 11
-; CHECK-NEXT: v_readlane_b32 s50, v42, 10
-; CHECK-NEXT: v_readlane_b32 s49, v42, 9
-; CHECK-NEXT: v_readlane_b32 s48, v42, 8
-; CHECK-NEXT: v_readlane_b32 s39, v42, 7
-; CHECK-NEXT: v_readlane_b32 s38, v42, 6
-; CHECK-NEXT: v_readlane_b32 s37, v42, 5
-; CHECK-NEXT: v_readlane_b32 s36, v42, 4
-; CHECK-NEXT: v_readlane_b32 s35, v42, 3
-; CHECK-NEXT: v_readlane_b32 s34, v42, 2
-; CHECK-NEXT: v_readlane_b32 s31, v42, 1
-; CHECK-NEXT: v_readlane_b32 s30, v42, 0
+; CHECK-NEXT: v_readlane_b32 s30, v42, 12
+; CHECK-NEXT: v_readlane_b32 s31, v42, 13
+; CHECK-NEXT: v_readlane_b32 s53, v42, 11
+; CHECK-NEXT: v_readlane_b32 s52, v42, 10
+; CHECK-NEXT: v_readlane_b32 s51, v42, 9
+; CHECK-NEXT: v_readlane_b32 s50, v42, 8
+; CHECK-NEXT: v_readlane_b32 s49, v42, 7
+; CHECK-NEXT: v_readlane_b32 s48, v42, 6
+; CHECK-NEXT: v_readlane_b32 s39, v42, 5
+; CHECK-NEXT: v_readlane_b32 s38, v42, 4
+; CHECK-NEXT: v_readlane_b32 s37, v42, 3
+; CHECK-NEXT: v_readlane_b32 s36, v42, 2
+; CHECK-NEXT: v_readlane_b32 s35, v42, 1
+; CHECK-NEXT: v_readlane_b32 s34, v42, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v42, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -684,33 +684,33 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 15
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s54, 12
; CHECK-NEXT: v_mov_b32_e32 v41, v1
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s30, 13
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
-; CHECK-NEXT: v_writelane_b32 v43, s54, 14
+; CHECK-NEXT: v_writelane_b32 v43, s31, 14
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -744,21 +744,21 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: v_readlane_b32 s54, v43, 14
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s30, v43, 13
+; CHECK-NEXT: v_readlane_b32 s31, v43, 14
+; CHECK-NEXT: v_readlane_b32 s54, v43, 12
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 15
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -771,3 +771,4 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
%call = tail call fast double @_Z4powndi(double %x, i32 %y)
ret double %call
}
+
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
index 554f40b5bfdfa..7ea641885a1f1 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -214,8 +214,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX8-NEXT: v_writelane_b32 v3, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX8-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-NEXT: v_readlane_b32 s30, v3, 0
+; GFX8-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -242,8 +242,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8
@@ -270,8 +270,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX9-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -297,8 +297,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX9-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -321,11 +321,12 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
+; GFX942-ARCH-FLAT-NEXT: s_nop 1
; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
@@ -352,8 +353,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index c4a3c70282810..b962014c92a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -3315,8 +3315,8 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: v_readlane_b32 s30, v2, 0
+; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3342,10 +3342,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_writelane_b32 v2, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT: v_readlane_b32 s30, v2, 0
; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v2, 1
-; GFX7-NEXT: v_readlane_b32 s30, v2, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3371,10 +3371,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
-; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3400,10 +3400,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v2, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v2, 0
; GFX900-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v2, 1
-; GFX900-NEXT: v_readlane_b32 s30, v2, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3426,13 +3426,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v4, s30, 0
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_writelane_b32 v4, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v4, 0
; GFX950-NEXT: scratch_store_short v1, v0, off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v4, 1
-; GFX950-NEXT: v_readlane_b32 s30, v4, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -3459,10 +3460,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
-; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3489,10 +3490,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
-; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -3519,10 +3521,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v4, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: scratch_store_b16 v1, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -3559,8 +3562,8 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: v_readlane_b32 s30, v2, 0
+; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3586,10 +3589,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_writelane_b32 v2, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT: v_readlane_b32 s30, v2, 0
; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v2, 1
-; GFX7-NEXT: v_readlane_b32 s30, v2, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3615,10 +3618,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
-; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3644,10 +3647,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v2, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v2, 0
; GFX900-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v2, 1
-; GFX900-NEXT: v_readlane_b32 s30, v2, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3670,13 +3673,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v4, s30, 0
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_writelane_b32 v4, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v4, 0
; GFX950-NEXT: scratch_store_dword v1, v0, off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v4, 1
-; GFX950-NEXT: v_readlane_b32 s30, v4, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
@@ -3703,10 +3707,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
-; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3733,10 +3737,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
-; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -3763,10 +3768,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v4, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: scratch_store_b32 v1, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v4, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v4, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -3807,8 +3813,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v4, 1
; GCN-NEXT: v_readlane_b32 s30, v4, 0
+; GCN-NEXT: v_readlane_b32 s31, v4, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3836,12 +3842,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v2
+; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
-; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3868,12 +3874,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
-; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3899,12 +3905,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v3, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v3, 0
; GFX900-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v3, 1
-; GFX900-NEXT: v_readlane_b32 s30, v3, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3927,16 +3933,17 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v5, s30, 0
-; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: v_mov_b32_e32 v4, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: scratch_store_short v4, v1, off offset:4 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: scratch_store_dword v4, v0, off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v5, 1
-; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -3963,12 +3970,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -3995,12 +4002,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
-; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -4028,13 +4036,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -4082,8 +4091,8 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v3, v7, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v8, 1
; GCN-NEXT: v_readlane_b32 s30, v8, 0
+; GCN-NEXT: v_readlane_b32 s31, v8, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4120,10 +4129,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 2, v2
+; GFX7-NEXT: v_readlane_b32 s30, v6, 0
; GFX7-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v6, 1
-; GFX7-NEXT: v_readlane_b32 s30, v6, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4150,12 +4159,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
-; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4181,12 +4190,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v3, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v3, 0
; GFX900-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v3, 1
-; GFX900-NEXT: v_readlane_b32 s30, v3, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4209,14 +4218,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v5, s30, 0
-; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: v_mov_b32_e32 v4, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v5, 1
-; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -4243,12 +4253,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4275,10 +4285,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
-; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
@@ -4306,10 +4317,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -4371,8 +4383,8 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v16, 1
; GCN-NEXT: v_readlane_b32 s30, v16, 0
+; GCN-NEXT: v_readlane_b32 s31, v16, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4423,10 +4435,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 2, v4
+; GFX7-NEXT: v_readlane_b32 s30, v10, 0
; GFX7-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v10, 1
-; GFX7-NEXT: v_readlane_b32 s30, v10, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4459,12 +4471,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT: v_readlane_b32 s30, v6, 0
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v6, 1
-; GFX8-NEXT: v_readlane_b32 s30, v6, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4490,6 +4502,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v5, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v5, 0
; GFX900-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -4499,7 +4512,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v5, 1
-; GFX900-NEXT: v_readlane_b32 s30, v5, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4522,13 +4534,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v5, s30, 0
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_writelane_b32 v5, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: scratch_store_dwordx4 v4, v[0:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v5, 1
-; GFX950-NEXT: v_readlane_b32 s30, v5, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
@@ -4555,6 +4568,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v5, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v5, 0
; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
@@ -4564,7 +4578,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v5, 1
-; GFX10-NEXT: v_readlane_b32 s30, v5, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4591,10 +4604,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v5, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v5, 0
; GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v5, 1
-; GFX11-NEXT: v_readlane_b32 s30, v5, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
@@ -4621,10 +4635,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v5, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v5, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -4714,8 +4729,8 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v9, v6, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v20, 1
; GCN-NEXT: v_readlane_b32 s30, v20, 0
+; GCN-NEXT: v_readlane_b32 s31, v20, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4794,10 +4809,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 2, v8
+; GFX7-NEXT: v_readlane_b32 s30, v18, 0
; GFX7-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v18, 1
-; GFX7-NEXT: v_readlane_b32 s30, v18, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4842,12 +4857,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8
+; GFX8-NEXT: v_readlane_b32 s30, v10, 0
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v10, 1
-; GFX8-NEXT: v_readlane_b32 s30, v10, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4873,6 +4888,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: v_writelane_b32 v9, s31, 1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: v_readlane_b32 s30, v9, 0
; GFX900-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -4890,7 +4906,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX900-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_readlane_b32 s31, v9, 1
-; GFX900-NEXT: v_readlane_b32 s30, v9, 0
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4913,15 +4928,16 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX950-NEXT: v_writelane_b32 v9, s30, 0
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_writelane_b32 v9, s31, 1
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: v_readlane_b32 s30, v9, 0
; GFX950-NEXT: scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: scratch_store_dwordx4 v8, v[0:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_readlane_b32 s31, v9, 1
-; GFX950-NEXT: v_readlane_b32 s30, v9, 0
; GFX950-NEXT: s_mov_b32 s32, s33
; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX950-NEXT: scratch_load_dword v9, off, s33 ; 4-byte Folded Reload
@@ -4948,6 +4964,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: v_writelane_b32 v9, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s30, v9, 0
; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
@@ -4965,7 +4982,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v9, 1
-; GFX10-NEXT: v_readlane_b32 s30, v9, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -4992,12 +5008,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: v_writelane_b32 v9, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v9, 0
; GFX11-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b128 v8, v[0:3], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v9, 1
-; GFX11-NEXT: v_readlane_b32 s30, v9, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
@@ -5024,13 +5041,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: v_writelane_b32 v9, s31, 1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_readlane_b32 s30, v9, 0
; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v9, 1
-; GFX1250-NEXT: v_readlane_b32 s30, v9, 0
; GFX1250-NEXT: s_mov_b32 s32, s33
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1
@@ -28593,822 +28611,6 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
ret bfloat %op
}
-define bfloat @v_rsq_bf16(bfloat %x) {
-; GCN-LABEL: v_rsq_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_mov_b32 s4, 0xf800000
-; GCN-NEXT: v_mov_b32_e32 v1, 0x260
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: v_sqrt_f32_e32 v2, v0
-; GCN-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; GCN-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; GCN-NEXT: v_fma_f32 v5, -v3, v2, v0
-; GCN-NEXT: v_fma_f32 v6, -v4, v2, v0
-; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GCN-NEXT: v_rcp_f32_e32 v2, v1
-; GCN-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GCN-NEXT: v_fma_f32 v2, v3, v2, v2
-; GCN-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GCN-NEXT: v_mul_f32_e32 v4, v3, v2
-; GCN-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GCN-NEXT: v_fma_f32 v4, v5, v2, v4
-; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_rsq_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_mov_b32 s4, 0xf800000
-; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX7-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; GFX7-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX7-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
-; GFX7-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX7-NEXT: v_rcp_f32_e32 v2, v1
-; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2
-; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_rsq_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: s_mov_b32 s4, 0xf800000
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], -1, v1
-; GFX8-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], 1, v1
-; GFX8-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
-; GFX8-NEXT: v_rcp_f32_e32 v3, v1
-; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
-; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
-; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
-; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
-; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-LABEL: v_rsq_bf16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: s_mov_b32 s4, 0xf800000
-; GFX900-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX900-NEXT: s_movk_i32 s6, 0x7fff
-; GFX900-NEXT: v_add_u32_e32 v2, -1, v1
-; GFX900-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX900-NEXT: v_add_u32_e32 v3, 1, v1
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX900-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX900-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX900-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT: v_add3_u32 v1, v1, v0, s6
-; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX900-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX900-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
-; GFX900-NEXT: v_rcp_f32_e32 v3, v1
-; GFX900-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; GFX900-NEXT: v_fma_f32 v3, v4, v3, v3
-; GFX900-NEXT: v_mul_f32_e32 v4, v2, v3
-; GFX900-NEXT: v_fma_f32 v5, -v1, v4, v2
-; GFX900-NEXT: v_fma_f32 v4, v5, v3, v4
-; GFX900-NEXT: v_fma_f32 v1, -v1, v4, v2
-; GFX900-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; GFX900-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX900-NEXT: v_add3_u32 v1, v1, v0, s6
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_rsq_bf16:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: s_mov_b32 s0, 0xf800000
-; GFX950-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_add_u32_e32 v2, -1, v1
-; GFX950-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3
-; GFX950-NEXT: v_add_u32_e32 v3, 1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX950-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1]
-; GFX950-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX950-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX950-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
-; GFX950-NEXT: v_rcp_f32_e32 v2, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX950-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX950-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX950-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX950-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX950-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX950-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX950-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX950-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_rsq_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v1
-; GFX10-NEXT: v_fma_f32 v4, -v2, v1, v0
-; GFX10-NEXT: v_fma_f32 v5, -v3, v1, v0
-; GFX10-NEXT: v_cmp_ge_f32_e64 s4, 0, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4
-; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
-; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0
-; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
-; GFX10-NEXT: v_rcp_f32_e32 v2, v1
-; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX10-NEXT: v_fma_f32 v5, -v1, v3, v4
-; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX10-NEXT: v_fma_f32 v1, -v1, v3, v4
-; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11TRUE16-LABEL: v_rsq_bf16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v1
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x4f800000, v1
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX11TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v1
-; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v1
-; GFX11TRUE16-NEXT: v_fma_f32 v4, -v2, v1, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_fma_f32 v5, -v3, v1, v0
-; GFX11TRUE16-NEXT: v_cmp_ge_f32_e64 s0, 0, v4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
-; GFX11TRUE16-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
-; GFX11TRUE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
-; GFX11TRUE16-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11TRUE16-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fma_f32 v5, -v1, v3, v4
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fma_f32 v1, -v1, v3, v4
-; GFX11TRUE16-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_rsq_bf16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX11FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v1
-; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v3, 1, v1
-; GFX11FAKE16-NEXT: v_fma_f32 v4, -v2, v1, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_fma_f32 v5, -v3, v1, v0
-; GFX11FAKE16-NEXT: v_cmp_ge_f32_e64 s0, 0, v4
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
-; GFX11FAKE16-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
-; GFX11FAKE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
-; GFX11FAKE16-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11FAKE16-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_fma_f32 v5, -v1, v3, v4
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_fma_f32 v1, -v1, v3, v4
-; GFX11FAKE16-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_rsq_bf16:
-; GFX1250TRUE16: ; %bb.0:
-; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
-; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX1250FAKE16-LABEL: v_rsq_bf16:
-; GFX1250FAKE16: ; %bb.0:
-; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
-; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
- %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
- %rsq = fdiv contract bfloat 1.0, %sqrt
- ret bfloat %rsq
-}
-
-define bfloat @v_neg_rsq_bf16(bfloat %x) {
-; GCN-LABEL: v_neg_rsq_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_mov_b32 s4, 0xf800000
-; GCN-NEXT: v_mov_b32_e32 v1, 0x260
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: v_sqrt_f32_e32 v2, v0
-; GCN-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; GCN-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; GCN-NEXT: v_fma_f32 v5, -v3, v2, v0
-; GCN-NEXT: v_fma_f32 v6, -v4, v2, v0
-; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GCN-NEXT: v_rcp_f32_e32 v2, v1
-; GCN-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GCN-NEXT: v_fma_f32 v2, v3, v2, v2
-; GCN-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GCN-NEXT: v_mul_f32_e32 v4, v3, v2
-; GCN-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GCN-NEXT: v_fma_f32 v4, v5, v2, v4
-; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_neg_rsq_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_mov_b32 s4, 0xf800000
-; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX7-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; GFX7-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX7-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
-; GFX7-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX7-NEXT: v_rcp_f32_e32 v2, v1
-; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2
-; GFX7-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_neg_rsq_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: s_mov_b32 s4, 0xf800000
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], -1, v1
-; GFX8-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], 1, v1
-; GFX8-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX8-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
-; GFX8-NEXT: v_rcp_f32_e32 v3, v1
-; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
-; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
-; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
-; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
-; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-LABEL: v_neg_rsq_bf16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: s_mov_b32 s4, 0xf800000
-; GFX900-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX900-NEXT: s_movk_i32 s6, 0x7fff
-; GFX900-NEXT: v_add_u32_e32 v2, -1, v1
-; GFX900-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX900-NEXT: v_add_u32_e32 v3, 1, v1
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX900-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX900-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX900-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT: v_add3_u32 v1, v1, v0, s6
-; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX900-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX900-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
-; GFX900-NEXT: v_rcp_f32_e32 v3, v1
-; GFX900-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; GFX900-NEXT: v_fma_f32 v3, v4, v3, v3
-; GFX900-NEXT: v_mul_f32_e32 v4, v2, v3
-; GFX900-NEXT: v_fma_f32 v5, -v1, v4, v2
-; GFX900-NEXT: v_fma_f32 v4, v5, v3, v4
-; GFX900-NEXT: v_fma_f32 v1, -v1, v4, v2
-; GFX900-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; GFX900-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX900-NEXT: v_add3_u32 v1, v1, v0, s6
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_neg_rsq_bf16:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: s_mov_b32 s0, 0xf800000
-; GFX950-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_add_u32_e32 v2, -1, v1
-; GFX950-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3
-; GFX950-NEXT: v_add_u32_e32 v3, 1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX950-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1]
-; GFX950-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX950-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX950-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
-; GFX950-NEXT: v_rcp_f32_e32 v2, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX950-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX950-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX950-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX950-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX950-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX950-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX950-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX950-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_neg_rsq_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v1
-; GFX10-NEXT: v_fma_f32 v4, -v2, v1, v0
-; GFX10-NEXT: v_fma_f32 v5, -v3, v1, v0
-; GFX10-NEXT: v_cmp_ge_f32_e64 s4, 0, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4
-; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
-; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, -1.0
-; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, -1.0, v0, -1.0
-; GFX10-NEXT: v_rcp_f32_e32 v2, v1
-; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX10-NEXT: v_fma_f32 v5, -v1, v3, v4
-; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX10-NEXT: v_fma_f32 v1, -v1, v3, v4
-; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11TRUE16-LABEL: v_neg_rsq_bf16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v1
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x4f800000, v1
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX11TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v1
-; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v1
-; GFX11TRUE16-NEXT: v_fma_f32 v4, -v2, v1, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_fma_f32 v5, -v3, v1, v0
-; GFX11TRUE16-NEXT: v_cmp_ge_f32_e64 s0, 0, v4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
-; GFX11TRUE16-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_div_scale_f32 v1, null, v0, v0, -1.0
-; GFX11TRUE16-NEXT: v_div_scale_f32 v4, vcc_lo, -1.0, v0, -1.0
-; GFX11TRUE16-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11TRUE16-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fma_f32 v5, -v1, v3, v4
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fma_f32 v1, -v1, v3, v4
-; GFX11TRUE16-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_neg_rsq_bf16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX11FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v1
-; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v3, 1, v1
-; GFX11FAKE16-NEXT: v_fma_f32 v4, -v2, v1, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_fma_f32 v5, -v3, v1, v0
-; GFX11FAKE16-NEXT: v_cmp_ge_f32_e64 s0, 0, v4
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
-; GFX11FAKE16-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_div_scale_f32 v1, null, v0, v0, -1.0
-; GFX11FAKE16-NEXT: v_div_scale_f32 v4, vcc_lo, -1.0, v0, -1.0
-; GFX11FAKE16-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11FAKE16-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_fma_f32 v5, -v1, v3, v4
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_fma_f32 v1, -v1, v3, v4
-; GFX11FAKE16-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_neg_rsq_bf16:
-; GFX1250TRUE16: ; %bb.0:
-; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
-; GFX1250TRUE16-NEXT: v_nop
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GFX1250TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
-;
-; GFX1250FAKE16-LABEL: v_neg_rsq_bf16:
-; GFX1250FAKE16: ; %bb.0:
-; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
-; GFX1250FAKE16-NEXT: v_nop
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GFX1250FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
- %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
- %rsq = fdiv contract bfloat -1.0, %sqrt
- ret bfloat %rsq
-}
-
declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
@@ -45471,18 +44673,18 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX7-NEXT: v_and_b32_e32 v0, 1, v26
; GFX7-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v27
+; GFX7-NEXT: v_writelane_b32 v33, s34, 0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v28
+; GFX7-NEXT: v_writelane_b32 v33, s35, 1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v29
-; GFX7-NEXT: v_writelane_b32 v33, s30, 0
+; GFX7-NEXT: v_writelane_b32 v33, s30, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v30
-; GFX7-NEXT: v_writelane_b32 v33, s31, 1
+; GFX7-NEXT: v_writelane_b32 v33, s31, 3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; GFX7-NEXT: v_writelane_b32 v33, s34, 2
-; GFX7-NEXT: v_writelane_b32 v33, s35, 3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
@@ -45552,6 +44754,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX7-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7-NEXT: s_mov_b32 s4, 0xffff
+; GFX7-NEXT: v_readlane_b32 s30, v33, 2
; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v3
; GFX7-NEXT: v_bfi_b32 v1, s4, v2, v5
; GFX7-NEXT: v_bfi_b32 v2, s4, v4, v7
@@ -45568,10 +44771,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX7-NEXT: v_bfi_b32 v13, s4, v26, v29
; GFX7-NEXT: v_bfi_b32 v14, s4, v28, v32
; GFX7-NEXT: v_bfi_b32 v15, s4, v31, v30
-; GFX7-NEXT: v_readlane_b32 s35, v33, 3
-; GFX7-NEXT: v_readlane_b32 s34, v33, 2
-; GFX7-NEXT: v_readlane_b32 s31, v33, 1
-; GFX7-NEXT: v_readlane_b32 s30, v33, 0
+; GFX7-NEXT: v_readlane_b32 s31, v33, 3
+; GFX7-NEXT: v_readlane_b32 s35, v33, 1
+; GFX7-NEXT: v_readlane_b32 s34, v33, 0
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
@@ -45627,34 +44829,34 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_and_b32_e32 v0, 1, v20
; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v21
+; GFX8-NEXT: v_writelane_b32 v34, s34, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v22
+; GFX8-NEXT: v_writelane_b32 v34, s35, 1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v23
+; GFX8-NEXT: v_writelane_b32 v34, s36, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v24
+; GFX8-NEXT: v_writelane_b32 v34, s37, 3
; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT: v_writelane_b32 v34, s30, 0
+; GFX8-NEXT: v_writelane_b32 v34, s38, 4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT: v_writelane_b32 v34, s31, 1
+; GFX8-NEXT: v_writelane_b32 v34, s39, 5
; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT: v_writelane_b32 v34, s34, 2
+; GFX8-NEXT: v_writelane_b32 v34, s30, 6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT: v_writelane_b32 v34, s35, 3
+; GFX8-NEXT: v_writelane_b32 v34, s31, 7
; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v29
-; GFX8-NEXT: v_writelane_b32 v34, s36, 4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v30
-; GFX8-NEXT: v_writelane_b32 v34, s37, 5
; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0
; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32
-; GFX8-NEXT: v_writelane_b32 v34, s38, 6
-; GFX8-NEXT: v_writelane_b32 v34, s39, 7
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0
@@ -45780,6 +44982,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v28
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; GFX8-NEXT: v_readlane_b32 s30, v34, 6
; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -45788,14 +44991,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_readlane_b32 s39, v34, 7
-; GFX8-NEXT: v_readlane_b32 s38, v34, 6
-; GFX8-NEXT: v_readlane_b32 s37, v34, 5
-; GFX8-NEXT: v_readlane_b32 s36, v34, 4
-; GFX8-NEXT: v_readlane_b32 s35, v34, 3
-; GFX8-NEXT: v_readlane_b32 s34, v34, 2
-; GFX8-NEXT: v_readlane_b32 s31, v34, 1
-; GFX8-NEXT: v_readlane_b32 s30, v34, 0
+; GFX8-NEXT: v_readlane_b32 s31, v34, 7
+; GFX8-NEXT: v_readlane_b32 s39, v34, 5
+; GFX8-NEXT: v_readlane_b32 s38, v34, 4
+; GFX8-NEXT: v_readlane_b32 s37, v34, 3
+; GFX8-NEXT: v_readlane_b32 s36, v34, 2
+; GFX8-NEXT: v_readlane_b32 s35, v34, 1
+; GFX8-NEXT: v_readlane_b32 s34, v34, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -45867,11 +45069,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: v_and_b32_e32 v0, 1, v28
; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32
-; GFX900-NEXT: v_writelane_b32 v33, s30, 0
-; GFX900-NEXT: v_writelane_b32 v33, s31, 1
-; GFX900-NEXT: v_writelane_b32 v33, s34, 2
+; GFX900-NEXT: v_writelane_b32 v33, s34, 0
+; GFX900-NEXT: v_writelane_b32 v33, s35, 1
+; GFX900-NEXT: v_writelane_b32 v33, s30, 2
+; GFX900-NEXT: v_writelane_b32 v33, s31, 3
; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX900-NEXT: v_writelane_b32 v33, s35, 3
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
@@ -45976,6 +45178,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_readlane_b32 s30, v33, 2
; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4
; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4
@@ -45992,10 +45195,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4
; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4
; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4
-; GFX900-NEXT: v_readlane_b32 s35, v33, 3
-; GFX900-NEXT: v_readlane_b32 s34, v33, 2
-; GFX900-NEXT: v_readlane_b32 s31, v33, 1
-; GFX900-NEXT: v_readlane_b32 s30, v33, 0
+; GFX900-NEXT: v_readlane_b32 s31, v33, 3
+; GFX900-NEXT: v_readlane_b32 s35, v33, 1
+; GFX900-NEXT: v_readlane_b32 s34, v33, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index ab2ad19d0f1bf..fb11d3b7d9d65 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -902,47 +902,47 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_waitcnt expcnt(0)
-; CHECK-NEXT: v_writelane_b32 v0, s30, 0
-; CHECK-NEXT: v_writelane_b32 v0, s31, 1
-; CHECK-NEXT: v_writelane_b32 v0, s33, 2
-; CHECK-NEXT: v_writelane_b32 v0, s34, 3
-; CHECK-NEXT: v_writelane_b32 v0, s35, 4
-; CHECK-NEXT: v_writelane_b32 v0, s36, 5
-; CHECK-NEXT: v_writelane_b32 v0, s37, 6
-; CHECK-NEXT: v_writelane_b32 v0, s38, 7
-; CHECK-NEXT: v_writelane_b32 v0, s39, 8
-; CHECK-NEXT: v_writelane_b32 v0, s48, 9
-; CHECK-NEXT: v_writelane_b32 v0, s49, 10
-; CHECK-NEXT: v_writelane_b32 v0, s50, 11
-; CHECK-NEXT: v_writelane_b32 v0, s51, 12
-; CHECK-NEXT: v_writelane_b32 v0, s52, 13
-; CHECK-NEXT: v_writelane_b32 v0, s53, 14
-; CHECK-NEXT: v_writelane_b32 v0, s54, 15
-; CHECK-NEXT: v_writelane_b32 v0, s55, 16
-; CHECK-NEXT: v_writelane_b32 v0, s64, 17
-; CHECK-NEXT: v_writelane_b32 v0, s65, 18
-; CHECK-NEXT: v_writelane_b32 v0, s66, 19
-; CHECK-NEXT: v_writelane_b32 v0, s67, 20
-; CHECK-NEXT: v_writelane_b32 v0, s68, 21
-; CHECK-NEXT: v_writelane_b32 v0, s69, 22
-; CHECK-NEXT: v_writelane_b32 v0, s70, 23
-; CHECK-NEXT: v_writelane_b32 v0, s71, 24
-; CHECK-NEXT: v_writelane_b32 v0, s80, 25
-; CHECK-NEXT: v_writelane_b32 v0, s81, 26
-; CHECK-NEXT: v_writelane_b32 v0, s82, 27
-; CHECK-NEXT: v_writelane_b32 v0, s83, 28
-; CHECK-NEXT: v_writelane_b32 v0, s84, 29
-; CHECK-NEXT: v_writelane_b32 v0, s85, 30
-; CHECK-NEXT: v_writelane_b32 v0, s86, 31
-; CHECK-NEXT: v_writelane_b32 v0, s87, 32
-; CHECK-NEXT: v_writelane_b32 v0, s96, 33
-; CHECK-NEXT: v_writelane_b32 v0, s97, 34
-; CHECK-NEXT: v_writelane_b32 v0, s98, 35
-; CHECK-NEXT: v_writelane_b32 v0, s99, 36
+; CHECK-NEXT: v_writelane_b32 v0, s33, 0
+; CHECK-NEXT: v_writelane_b32 v0, s34, 1
+; CHECK-NEXT: v_writelane_b32 v0, s35, 2
+; CHECK-NEXT: v_writelane_b32 v0, s36, 3
+; CHECK-NEXT: v_writelane_b32 v0, s37, 4
+; CHECK-NEXT: v_writelane_b32 v0, s38, 5
+; CHECK-NEXT: v_writelane_b32 v0, s39, 6
+; CHECK-NEXT: v_writelane_b32 v0, s48, 7
+; CHECK-NEXT: v_writelane_b32 v0, s49, 8
+; CHECK-NEXT: v_writelane_b32 v0, s50, 9
+; CHECK-NEXT: v_writelane_b32 v0, s51, 10
+; CHECK-NEXT: v_writelane_b32 v0, s52, 11
+; CHECK-NEXT: v_writelane_b32 v0, s53, 12
+; CHECK-NEXT: v_writelane_b32 v0, s54, 13
+; CHECK-NEXT: v_writelane_b32 v0, s55, 14
+; CHECK-NEXT: v_writelane_b32 v0, s64, 15
+; CHECK-NEXT: v_writelane_b32 v0, s65, 16
+; CHECK-NEXT: v_writelane_b32 v0, s66, 17
+; CHECK-NEXT: v_writelane_b32 v0, s67, 18
+; CHECK-NEXT: v_writelane_b32 v0, s68, 19
+; CHECK-NEXT: v_writelane_b32 v0, s69, 20
+; CHECK-NEXT: v_writelane_b32 v0, s70, 21
+; CHECK-NEXT: v_writelane_b32 v0, s71, 22
+; CHECK-NEXT: v_writelane_b32 v0, s80, 23
+; CHECK-NEXT: v_writelane_b32 v0, s81, 24
+; CHECK-NEXT: v_writelane_b32 v0, s82, 25
+; CHECK-NEXT: v_writelane_b32 v0, s83, 26
+; CHECK-NEXT: v_writelane_b32 v0, s84, 27
+; CHECK-NEXT: v_writelane_b32 v0, s85, 28
+; CHECK-NEXT: v_writelane_b32 v0, s86, 29
+; CHECK-NEXT: v_writelane_b32 v0, s87, 30
+; CHECK-NEXT: v_writelane_b32 v0, s96, 31
+; CHECK-NEXT: v_writelane_b32 v0, s97, 32
+; CHECK-NEXT: v_writelane_b32 v0, s98, 33
+; CHECK-NEXT: v_writelane_b32 v0, s99, 34
+; CHECK-NEXT: v_writelane_b32 v0, s100, 35
+; CHECK-NEXT: v_writelane_b32 v0, s101, 36
; CHECK-NEXT: s_mov_b32 s40, s12
-; CHECK-NEXT: v_writelane_b32 v0, s100, 37
+; CHECK-NEXT: v_writelane_b32 v0, s30, 37
; CHECK-NEXT: s_cmp_eq_u32 s40, 0
-; CHECK-NEXT: v_writelane_b32 v0, s101, 38
+; CHECK-NEXT: v_writelane_b32 v0, s31, 38
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: ;;#ASMEND
@@ -1380,6 +1380,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; reg use s31
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s30, v0, 37
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; reg use s32
; CHECK-NEXT: ;;#ASMEND
@@ -1596,45 +1597,44 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; reg use vcc_hi
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s101, v0, 38
-; CHECK-NEXT: v_readlane_b32 s100, v0, 37
-; CHECK-NEXT: v_readlane_b32 s99, v0, 36
-; CHECK-NEXT: v_readlane_b32 s98, v0, 35
-; CHECK-NEXT: v_readlane_b32 s97, v0, 34
-; CHECK-NEXT: v_readlane_b32 s96, v0, 33
-; CHECK-NEXT: v_readlane_b32 s87, v0, 32
-; CHECK-NEXT: v_readlane_b32 s86, v0, 31
-; CHECK-NEXT: v_readlane_b32 s85, v0, 30
-; CHECK-NEXT: v_readlane_b32 s84, v0, 29
-; CHECK-NEXT: v_readlane_b32 s83, v0, 28
-; CHECK-NEXT: v_readlane_b32 s82, v0, 27
-; CHECK-NEXT: v_readlane_b32 s81, v0, 26
-; CHECK-NEXT: v_readlane_b32 s80, v0, 25
-; CHECK-NEXT: v_readlane_b32 s71, v0, 24
-; CHECK-NEXT: v_readlane_b32 s70, v0, 23
-; CHECK-NEXT: v_readlane_b32 s69, v0, 22
-; CHECK-NEXT: v_readlane_b32 s68, v0, 21
-; CHECK-NEXT: v_readlane_b32 s67, v0, 20
-; CHECK-NEXT: v_readlane_b32 s66, v0, 19
-; CHECK-NEXT: v_readlane_b32 s65, v0, 18
-; CHECK-NEXT: v_readlane_b32 s64, v0, 17
-; CHECK-NEXT: v_readlane_b32 s55, v0, 16
-; CHECK-NEXT: v_readlane_b32 s54, v0, 15
-; CHECK-NEXT: v_readlane_b32 s53, v0, 14
-; CHECK-NEXT: v_readlane_b32 s52, v0, 13
-; CHECK-NEXT: v_readlane_b32 s51, v0, 12
-; CHECK-NEXT: v_readlane_b32 s50, v0, 11
-; CHECK-NEXT: v_readlane_b32 s49, v0, 10
-; CHECK-NEXT: v_readlane_b32 s48, v0, 9
-; CHECK-NEXT: v_readlane_b32 s39, v0, 8
-; CHECK-NEXT: v_readlane_b32 s38, v0, 7
-; CHECK-NEXT: v_readlane_b32 s37, v0, 6
-; CHECK-NEXT: v_readlane_b32 s36, v0, 5
-; CHECK-NEXT: v_readlane_b32 s35, v0, 4
-; CHECK-NEXT: v_readlane_b32 s34, v0, 3
-; CHECK-NEXT: v_readlane_b32 s33, v0, 2
-; CHECK-NEXT: v_readlane_b32 s31, v0, 1
-; CHECK-NEXT: v_readlane_b32 s30, v0, 0
+; CHECK-NEXT: v_readlane_b32 s31, v0, 38
+; CHECK-NEXT: v_readlane_b32 s101, v0, 36
+; CHECK-NEXT: v_readlane_b32 s100, v0, 35
+; CHECK-NEXT: v_readlane_b32 s99, v0, 34
+; CHECK-NEXT: v_readlane_b32 s98, v0, 33
+; CHECK-NEXT: v_readlane_b32 s97, v0, 32
+; CHECK-NEXT: v_readlane_b32 s96, v0, 31
+; CHECK-NEXT: v_readlane_b32 s87, v0, 30
+; CHECK-NEXT: v_readlane_b32 s86, v0, 29
+; CHECK-NEXT: v_readlane_b32 s85, v0, 28
+; CHECK-NEXT: v_readlane_b32 s84, v0, 27
+; CHECK-NEXT: v_readlane_b32 s83, v0, 26
+; CHECK-NEXT: v_readlane_b32 s82, v0, 25
+; CHECK-NEXT: v_readlane_b32 s81, v0, 24
+; CHECK-NEXT: v_readlane_b32 s80, v0, 23
+; CHECK-NEXT: v_readlane_b32 s71, v0, 22
+; CHECK-NEXT: v_readlane_b32 s70, v0, 21
+; CHECK-NEXT: v_readlane_b32 s69, v0, 20
+; CHECK-NEXT: v_readlane_b32 s68, v0, 19
+; CHECK-NEXT: v_readlane_b32 s67, v0, 18
+; CHECK-NEXT: v_readlane_b32 s66, v0, 17
+; CHECK-NEXT: v_readlane_b32 s65, v0, 16
+; CHECK-NEXT: v_readlane_b32 s64, v0, 15
+; CHECK-NEXT: v_readlane_b32 s55, v0, 14
+; CHECK-NEXT: v_readlane_b32 s54, v0, 13
+; CHECK-NEXT: v_readlane_b32 s53, v0, 12
+; CHECK-NEXT: v_readlane_b32 s52, v0, 11
+; CHECK-NEXT: v_readlane_b32 s51, v0, 10
+; CHECK-NEXT: v_readlane_b32 s50, v0, 9
+; CHECK-NEXT: v_readlane_b32 s49, v0, 8
+; CHECK-NEXT: v_readlane_b32 s48, v0, 7
+; CHECK-NEXT: v_readlane_b32 s39, v0, 6
+; CHECK-NEXT: v_readlane_b32 s38, v0, 5
+; CHECK-NEXT: v_readlane_b32 s37, v0, 4
+; CHECK-NEXT: v_readlane_b32 s36, v0, 3
+; CHECK-NEXT: v_readlane_b32 s35, v0, 2
+; CHECK-NEXT: v_readlane_b32 s34, v0, 1
+; CHECK-NEXT: v_readlane_b32 s33, v0, 0
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
index 6ccdbdea135bc..ec54a359ae7c2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
@@ -25,8 +25,8 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -53,8 +53,8 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -84,8 +84,8 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -112,8 +112,8 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
index bd03e092c0fa0..44ae9ea6d2a94 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
@@ -27,8 +27,8 @@ define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #
; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -58,8 +58,8 @@ define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #
; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a16i32_inreg@rel32@hi+12
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -89,8 +89,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inre
; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg_i32_inreg@rel32@hi+12
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index c1b3278144d59..5f59d780c062d 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -48,8 +48,8 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -76,8 +76,8 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -107,8 +107,8 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -135,8 +135,8 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -166,8 +166,8 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -194,8 +194,8 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -225,8 +225,8 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -253,8 +253,8 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -284,8 +284,8 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -312,8 +312,8 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -343,8 +343,8 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -371,8 +371,8 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -402,8 +402,8 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -430,8 +430,8 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -461,8 +461,8 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -489,8 +489,8 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -520,8 +520,8 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -548,8 +548,8 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -579,8 +579,8 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -607,8 +607,8 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -638,8 +638,8 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -666,8 +666,8 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -697,8 +697,8 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -725,8 +725,8 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -756,8 +756,8 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -784,8 +784,8 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -815,8 +815,8 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -843,8 +843,8 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -874,8 +874,8 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -902,8 +902,8 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -933,8 +933,8 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -961,8 +961,8 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -992,8 +992,8 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1020,8 +1020,8 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1051,8 +1051,8 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1079,8 +1079,8 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1110,8 +1110,8 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1138,8 +1138,8 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1169,8 +1169,8 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1197,8 +1197,8 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1228,8 +1228,8 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
; GFX9-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[40:41]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1256,8 +1256,8 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[26:27]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1289,8 +1289,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1317,8 +1317,8 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 293e24f2d1b9d..60bad0d70ec24 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -7175,8 +7175,8 @@ define void @stack_12xv3i32() #0 {
; VI-NEXT: v_mov_b32_e32 v30, 10
; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: s_mov_b32 s32, s33
; VI-NEXT: v_readlane_b32 s4, v40, 2
; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7243,8 +7243,8 @@ define void @stack_12xv3i32() #0 {
; CI-NEXT: v_mov_b32_e32 v30, 10
; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: s_mov_b32 s32, s33
; CI-NEXT: v_readlane_b32 s4, v40, 2
; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7311,8 +7311,8 @@ define void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v30, 10
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7361,8 +7361,8 @@ define void @stack_12xv3i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7429,8 +7429,8 @@ define void @stack_12xv3i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v30, 10
; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: s_mov_b32 s32, s33
; HSA-NEXT: v_readlane_b32 s4, v40, 2
; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7514,8 +7514,8 @@ define void @stack_12xv3f32() #0 {
; VI-NEXT: v_mov_b32_e32 v30, 0x41200000
; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: s_mov_b32 s32, s33
; VI-NEXT: v_readlane_b32 s4, v40, 2
; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7582,8 +7582,8 @@ define void @stack_12xv3f32() #0 {
; CI-NEXT: v_mov_b32_e32 v30, 0x41200000
; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: s_mov_b32 s32, s33
; CI-NEXT: v_readlane_b32 s4, v40, 2
; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7650,8 +7650,8 @@ define void @stack_12xv3f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7704,8 +7704,8 @@ define void @stack_12xv3f32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7772,8 +7772,8 @@ define void @stack_12xv3f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000
; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: s_mov_b32 s32, s33
; HSA-NEXT: v_readlane_b32 s4, v40, 2
; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7865,8 +7865,8 @@ define void @stack_8xv5i32() #0 {
; VI-NEXT: v_mov_b32_e32 v30, 6
; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: s_mov_b32 s32, s33
; VI-NEXT: v_readlane_b32 s4, v40, 2
; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -7941,8 +7941,8 @@ define void @stack_8xv5i32() #0 {
; CI-NEXT: v_mov_b32_e32 v30, 6
; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: s_mov_b32 s32, s33
; CI-NEXT: v_readlane_b32 s4, v40, 2
; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8017,8 +8017,8 @@ define void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v30, 6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8072,8 +8072,8 @@ define void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8148,8 +8148,8 @@ define void @stack_8xv5i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v30, 6
; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: s_mov_b32 s32, s33
; HSA-NEXT: v_readlane_b32 s4, v40, 2
; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8237,8 +8237,8 @@ define void @stack_8xv5f32() #0 {
; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000
; VI-NEXT: v_writelane_b32 v40, s31, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: s_mov_b32 s32, s33
; VI-NEXT: v_readlane_b32 s4, v40, 2
; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8313,8 +8313,8 @@ define void @stack_8xv5f32() #0 {
; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000
; CI-NEXT: v_writelane_b32 v40, s31, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: s_mov_b32 s32, s33
; CI-NEXT: v_readlane_b32 s4, v40, 2
; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8389,8 +8389,8 @@ define void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -8447,8 +8447,8 @@ define void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8523,8 +8523,8 @@ define void @stack_8xv5f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000
; HSA-NEXT: v_writelane_b32 v40, s31, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: s_mov_b32 s32, s33
; HSA-NEXT: v_readlane_b32 s4, v40, 2
; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index b250227735bd3..26727e53d990c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -25,8 +25,8 @@ define void @use_vcc() #1 {
; GCN: v_writelane_b32 v40, s30, 0
; GCN: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s31, v40, 1
; GCN: v_readlane_b32 s30, v40, 0
+; GCN: v_readlane_b32 s31, v40, 1
; GCN: v_readlane_b32 s4, v40, 2
; GCN: s_mov_b32 s33, s4
; GCN: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index aed1079158154..f9070339093da 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -40,22 +40,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT: v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT: v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 2
; MUBUF-NEXT: s_getpc_b64 s[34:35]
; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 3
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT: v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT: v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 4
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -74,22 +74,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
-; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2
; FLATSCR-NEXT: s_getpc_b64 s[34:35]
; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -114,20 +114,20 @@ define void @test_func_call_external_void_funcx2() #0 {
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
-; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
-; MUBUF-NEXT: v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT: v_writelane_b32 v40, s35, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 2
; MUBUF-NEXT: s_getpc_b64 s[34:35]
; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 3
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; MUBUF-NEXT: v_readlane_b32 s35, v40, 3
-; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 3
+; MUBUF-NEXT: v_readlane_b32 s35, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 4
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -146,20 +146,20 @@ define void @test_func_call_external_void_funcx2() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
-; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
-; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2
; FLATSCR-NEXT: s_getpc_b64 s[34:35]
; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3
-; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -185,8 +185,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: v_readlane_b32 s31, v0, 1
; MUBUF-NEXT: v_readlane_b32 s30, v0, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v0, 1
; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
@@ -204,8 +204,8 @@ define void @void_func_void_clobber_s30_s31() #2 {
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v0, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1
; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
@@ -452,23 +452,23 @@ define void @callee_saved_sgpr_func() #2 {
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v40, s4, 3
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 2
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; def s40
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_mov_b32 s34, s40
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 1
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; use s34
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 3
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -488,23 +488,23 @@ define void @callee_saved_sgpr_func() #2 {
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 3
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 1
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 2
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; def s40
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_mov_b32 s34, s40
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 1
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s34
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 3
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -555,13 +555,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: v_writelane_b32 v41, s4, 3
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_writelane_b32 v41, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v41, s31, 1
+; MUBUF-NEXT: v_writelane_b32 v41, s34, 0
+; MUBUF-NEXT: v_writelane_b32 v41, s30, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF-NEXT: v_writelane_b32 v41, s34, 2
+; MUBUF-NEXT: v_writelane_b32 v41, s31, 2
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; def s40
; MUBUF-NEXT: ;;#ASMEND
@@ -577,9 +577,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; MUBUF-NEXT: ; use v40
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF-NEXT: v_readlane_b32 s34, v41, 2
-; MUBUF-NEXT: v_readlane_b32 s31, v41, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v41, 0
+; MUBUF-NEXT: v_readlane_b32 s30, v41, 1
+; MUBUF-NEXT: v_readlane_b32 s31, v41, 2
+; MUBUF-NEXT: v_readlane_b32 s34, v41, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v41, 3
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -599,13 +599,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
-; FLATSCR-NEXT: v_writelane_b32 v41, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v41, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v41, s34, 0
+; FLATSCR-NEXT: v_writelane_b32 v41, s30, 1
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT: v_writelane_b32 v41, s34, 2
+; FLATSCR-NEXT: v_writelane_b32 v41, s31, 2
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; def s40
; FLATSCR-NEXT: ;;#ASMEND
@@ -621,9 +621,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
; FLATSCR-NEXT: ; use v40
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT: v_readlane_b32 s34, v41, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v41, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v41, 0
+; FLATSCR-NEXT: v_readlane_b32 s30, v41, 1
+; FLATSCR-NEXT: v_readlane_b32 s31, v41, 2
+; FLATSCR-NEXT: v_readlane_b32 s34, v41, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v41, 3
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll
index e2ca278d687be..8ae550e5eef9c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-skip.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll
@@ -5,6 +5,10 @@
; A call should be skipped if all lanes are zero, since we don't know
; what side effects should be avoided inside the call.
define hidden void @func() #1 {
+; GCN-LABEL: func:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
ret void
}
@@ -30,8 +34,8 @@ define void @if_call(i32 %flag) #0 {
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: .LBB1_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v1, 1
; GCN-NEXT: v_readlane_b32 s30, v1, 0
+; GCN-NEXT: v_readlane_b32 s31, v1, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index e7254eb5c3465..07f58df81c502 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -132,8 +132,8 @@ define void @callee_with_stack_and_call() #0 {
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -162,8 +162,8 @@ define void @callee_with_stack_and_call() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -201,8 +201,8 @@ define void @callee_no_stack_with_call() #0 {
; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -228,8 +228,8 @@ define void @callee_no_stack_with_call() #0 {
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -359,24 +359,24 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
-; FLATSCR-NEXT: v_writelane_b32 v40, s36, 2
-; FLATSCR-NEXT: v_writelane_b32 v40, s37, 3
-; FLATSCR-NEXT: v_writelane_b32 v40, s38, 4
-; FLATSCR-NEXT: v_writelane_b32 v40, s39, 5
-; FLATSCR-NEXT: v_writelane_b32 v40, s48, 6
-; FLATSCR-NEXT: v_writelane_b32 v40, s49, 7
-; FLATSCR-NEXT: v_writelane_b32 v40, s50, 8
-; FLATSCR-NEXT: v_writelane_b32 v40, s51, 9
-; FLATSCR-NEXT: v_writelane_b32 v40, s52, 10
-; FLATSCR-NEXT: v_writelane_b32 v40, s53, 11
-; FLATSCR-NEXT: v_writelane_b32 v40, s54, 12
-; FLATSCR-NEXT: v_writelane_b32 v40, s55, 13
-; FLATSCR-NEXT: v_writelane_b32 v40, s64, 14
-; FLATSCR-NEXT: v_writelane_b32 v40, s65, 15
-; FLATSCR-NEXT: v_writelane_b32 v40, s66, 16
-; FLATSCR-NEXT: v_writelane_b32 v40, s67, 17
+; FLATSCR-NEXT: v_writelane_b32 v40, s36, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s37, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s38, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s39, 3
+; FLATSCR-NEXT: v_writelane_b32 v40, s48, 4
+; FLATSCR-NEXT: v_writelane_b32 v40, s49, 5
+; FLATSCR-NEXT: v_writelane_b32 v40, s50, 6
+; FLATSCR-NEXT: v_writelane_b32 v40, s51, 7
+; FLATSCR-NEXT: v_writelane_b32 v40, s52, 8
+; FLATSCR-NEXT: v_writelane_b32 v40, s53, 9
+; FLATSCR-NEXT: v_writelane_b32 v40, s54, 10
+; FLATSCR-NEXT: v_writelane_b32 v40, s55, 11
+; FLATSCR-NEXT: v_writelane_b32 v40, s64, 12
+; FLATSCR-NEXT: v_writelane_b32 v40, s65, 13
+; FLATSCR-NEXT: v_writelane_b32 v40, s66, 14
+; FLATSCR-NEXT: v_writelane_b32 v40, s67, 15
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 17
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: ;;#ASMSTART
@@ -414,6 +414,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s[16:31]
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 16
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s[72:79]
; FLATSCR-NEXT: ;;#ASMEND
@@ -423,24 +424,23 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s[0:15]
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_readlane_b32 s67, v40, 17
-; FLATSCR-NEXT: v_readlane_b32 s66, v40, 16
-; FLATSCR-NEXT: v_readlane_b32 s65, v40, 15
-; FLATSCR-NEXT: v_readlane_b32 s64, v40, 14
-; FLATSCR-NEXT: v_readlane_b32 s55, v40, 13
-; FLATSCR-NEXT: v_readlane_b32 s54, v40, 12
-; FLATSCR-NEXT: v_readlane_b32 s53, v40, 11
-; FLATSCR-NEXT: v_readlane_b32 s52, v40, 10
-; FLATSCR-NEXT: v_readlane_b32 s51, v40, 9
-; FLATSCR-NEXT: v_readlane_b32 s50, v40, 8
-; FLATSCR-NEXT: v_readlane_b32 s49, v40, 7
-; FLATSCR-NEXT: v_readlane_b32 s48, v40, 6
-; FLATSCR-NEXT: v_readlane_b32 s39, v40, 5
-; FLATSCR-NEXT: v_readlane_b32 s38, v40, 4
-; FLATSCR-NEXT: v_readlane_b32 s37, v40, 3
-; FLATSCR-NEXT: v_readlane_b32 s36, v40, 2
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 17
+; FLATSCR-NEXT: v_readlane_b32 s67, v40, 15
+; FLATSCR-NEXT: v_readlane_b32 s66, v40, 14
+; FLATSCR-NEXT: v_readlane_b32 s65, v40, 13
+; FLATSCR-NEXT: v_readlane_b32 s64, v40, 12
+; FLATSCR-NEXT: v_readlane_b32 s55, v40, 11
+; FLATSCR-NEXT: v_readlane_b32 s54, v40, 10
+; FLATSCR-NEXT: v_readlane_b32 s53, v40, 9
+; FLATSCR-NEXT: v_readlane_b32 s52, v40, 8
+; FLATSCR-NEXT: v_readlane_b32 s51, v40, 7
+; FLATSCR-NEXT: v_readlane_b32 s50, v40, 6
+; FLATSCR-NEXT: v_readlane_b32 s49, v40, 5
+; FLATSCR-NEXT: v_readlane_b32 s48, v40, 4
+; FLATSCR-NEXT: v_readlane_b32 s39, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s38, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s37, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s36, v40, 0
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
@@ -971,14 +971,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: v_writelane_b32 v1, s30, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
-; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_writelane_b32 v1, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
+; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -997,14 +997,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
+; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1037,17 +1037,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
-; MUBUF-NEXT: s_addk_i32 s32, 0x300
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_addk_i32 s32, 0x300
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved initial VGPRs
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1066,17 +1066,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT: s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_add_i32 s32, s32, 12
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved initial VGPRs
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
@@ -1118,18 +1118,18 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000
-; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved SGPRs
; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; clobber nonpreserved VGPRs
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
-; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100
@@ -1158,11 +1158,11 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved SGPRs
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; clobber nonpreserved VGPRs
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
-; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004
@@ -1220,8 +1220,8 @@ define void @ipra_call_with_stack() #0 {
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
; MUBUF-NEXT: v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v1, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1248,8 +1248,8 @@ define void @ipra_call_with_stack() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v0, s33
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index 5f965ba431ab5..bb5963244da3c 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -430,8 +430,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -463,8 +463,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -496,8 +496,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -984,8 +984,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
+; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1048,8 +1048,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GFX90A-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
+; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-NEXT: s_mov_b32 s32, s33
; GFX90A-NEXT: v_readlane_b32 s4, v40, 2
; GFX90A-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1094,8 +1094,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1445,8 +1445,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-NEXT: v_mov_b32_e32 v0, 10
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index bb2f06bfe83f8..f20be656f3af0 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -275,8 +275,8 @@ define void @func_indirect_use_workitem_id_x() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -308,8 +308,8 @@ define void @func_indirect_use_workitem_id_y() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -341,8 +341,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -696,8 +696,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -742,8 +742,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1019,8 +1019,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-NEXT: v_mov_b32_e32 v0, 10
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1469,8 +1469,8 @@ define void @func_call_no_workitem_id_hints() #2 {
; GCN-NEXT: v_mov_b32_e32 v0, 9
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
index 7a241e0e4dff0..6d3a6dc43544b 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-entry.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
@@ -51,8 +51,8 @@ define void @caller() {
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s0, v40, 2
; CHECK-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
index 86486e56d46ac..4144fafa15684 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
@@ -153,8 +153,8 @@ define i32 @caller_passes_42() {
; SDAG-NEXT: v_writelane_b32 v18, s31, 1
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_swappc_b64 s[30:31], s[40:41]
-; SDAG-NEXT: v_readlane_b32 s31, v18, 1
; SDAG-NEXT: v_readlane_b32 s30, v18, 0
+; SDAG-NEXT: v_readlane_b32 s31, v18, 1
; SDAG-NEXT: s_mov_b32 s32, s33
; SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SDAG-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -212,8 +212,8 @@ define i32 @caller_passes_42() {
; GISEL-NEXT: v_writelane_b32 v18, s31, 1
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[40:41]
-; GISEL-NEXT: v_readlane_b32 s31, v18, 1
; GISEL-NEXT: v_readlane_b32 s30, v18, 0
+; GISEL-NEXT: v_readlane_b32 s31, v18, 1
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index ef676ddc8070e..5b37c93ac0bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -378,29 +378,29 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v43, s16, 14
-; GFX9-NEXT: v_writelane_b32 v43, s30, 0
-; GFX9-NEXT: v_writelane_b32 v43, s31, 1
-; GFX9-NEXT: v_writelane_b32 v43, s34, 2
-; GFX9-NEXT: v_writelane_b32 v43, s35, 3
-; GFX9-NEXT: v_writelane_b32 v43, s36, 4
-; GFX9-NEXT: v_writelane_b32 v43, s37, 5
-; GFX9-NEXT: v_writelane_b32 v43, s38, 6
-; GFX9-NEXT: v_writelane_b32 v43, s39, 7
-; GFX9-NEXT: v_writelane_b32 v43, s48, 8
-; GFX9-NEXT: v_writelane_b32 v43, s49, 9
-; GFX9-NEXT: v_writelane_b32 v43, s50, 10
+; GFX9-NEXT: v_writelane_b32 v43, s34, 0
+; GFX9-NEXT: v_writelane_b32 v43, s35, 1
+; GFX9-NEXT: v_writelane_b32 v43, s36, 2
+; GFX9-NEXT: v_writelane_b32 v43, s37, 3
+; GFX9-NEXT: v_writelane_b32 v43, s38, 4
+; GFX9-NEXT: v_writelane_b32 v43, s39, 5
+; GFX9-NEXT: v_writelane_b32 v43, s48, 6
+; GFX9-NEXT: v_writelane_b32 v43, s49, 7
+; GFX9-NEXT: v_writelane_b32 v43, s50, 8
+; GFX9-NEXT: v_writelane_b32 v43, s51, 9
+; GFX9-NEXT: v_writelane_b32 v43, s52, 10
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v43, s51, 11
+; GFX9-NEXT: v_writelane_b32 v43, s53, 11
; GFX9-NEXT: v_mov_b32_e32 v42, v1
-; GFX9-NEXT: v_writelane_b32 v43, s52, 12
+; GFX9-NEXT: v_writelane_b32 v43, s30, 12
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v43, s53, 13
+; GFX9-NEXT: v_writelane_b32 v43, s31, 13
; GFX9-NEXT: v_mov_b32_e32 v40, v31
; GFX9-NEXT: v_mov_b32_e32 v41, v2
; GFX9-NEXT: s_mov_b32 s50, s15
@@ -432,21 +432,21 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: v_readlane_b32 s30, v43, 12
; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX9-NEXT: v_readlane_b32 s53, v43, 13
-; GFX9-NEXT: v_readlane_b32 s52, v43, 12
-; GFX9-NEXT: v_readlane_b32 s51, v43, 11
-; GFX9-NEXT: v_readlane_b32 s50, v43, 10
-; GFX9-NEXT: v_readlane_b32 s49, v43, 9
-; GFX9-NEXT: v_readlane_b32 s48, v43, 8
-; GFX9-NEXT: v_readlane_b32 s39, v43, 7
-; GFX9-NEXT: v_readlane_b32 s38, v43, 6
-; GFX9-NEXT: v_readlane_b32 s37, v43, 5
-; GFX9-NEXT: v_readlane_b32 s36, v43, 4
-; GFX9-NEXT: v_readlane_b32 s35, v43, 3
-; GFX9-NEXT: v_readlane_b32 s34, v43, 2
-; GFX9-NEXT: v_readlane_b32 s31, v43, 1
-; GFX9-NEXT: v_readlane_b32 s30, v43, 0
+; GFX9-NEXT: v_readlane_b32 s31, v43, 13
+; GFX9-NEXT: v_readlane_b32 s53, v43, 11
+; GFX9-NEXT: v_readlane_b32 s52, v43, 10
+; GFX9-NEXT: v_readlane_b32 s51, v43, 9
+; GFX9-NEXT: v_readlane_b32 s50, v43, 8
+; GFX9-NEXT: v_readlane_b32 s49, v43, 7
+; GFX9-NEXT: v_readlane_b32 s48, v43, 6
+; GFX9-NEXT: v_readlane_b32 s39, v43, 5
+; GFX9-NEXT: v_readlane_b32 s38, v43, 4
+; GFX9-NEXT: v_readlane_b32 s37, v43, 3
+; GFX9-NEXT: v_readlane_b32 s36, v43, 2
+; GFX9-NEXT: v_readlane_b32 s35, v43, 1
+; GFX9-NEXT: v_readlane_b32 s34, v43, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v43, 14
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 38c20c7cf62d6..0cab17c9bfcfc 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -40,8 +40,8 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -76,8 +76,8 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -112,8 +112,8 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -148,8 +148,8 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
index 555b24a00f2cf..c6ac9837b8633 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -2005,8 +2005,8 @@ define hidden void @func_call_clobber() #0 {
; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX900-NEXT: v_writelane_b32 v40, s31, 1
; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX900-NEXT: v_readlane_b32 s31, v40, 1
; GFX900-NEXT: v_readlane_b32 s30, v40, 0
+; GFX900-NEXT: v_readlane_b32 s31, v40, 1
; GFX900-NEXT: s_mov_b32 s32, s33
; GFX900-NEXT: v_readlane_b32 s4, v40, 2
; GFX900-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -2278,8 +2278,8 @@ define hidden void @func_call_clobber() #0 {
; GFX90A-V2A-DIS-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-V2A-DIS-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-V2A-DIS-NEXT: s_mov_b32 s32, s33
; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s4, v40, 2
; GFX90A-V2A-DIS-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -2551,8 +2551,8 @@ define hidden void @func_call_clobber() #0 {
; GFX90A-V2A-EN-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s31, 1
; GFX90A-V2A-EN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX90A-V2A-EN-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-V2A-EN-NEXT: v_readlane_b32 s30, v40, 0
+; GFX90A-V2A-EN-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-V2A-EN-NEXT: s_mov_b32 s32, s33
; GFX90A-V2A-EN-NEXT: v_readlane_b32 s4, v40, 2
; GFX90A-V2A-EN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -2793,8 +2793,8 @@ define hidden void @func_call_clobber() #0 {
; WAVE32-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; WAVE32-NEXT: v_writelane_b32 v40, s31, 1
; WAVE32-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; WAVE32-NEXT: v_readlane_b32 s31, v40, 1
; WAVE32-NEXT: v_readlane_b32 s30, v40, 0
+; WAVE32-NEXT: v_readlane_b32 s31, v40, 1
; WAVE32-NEXT: s_mov_b32 s32, s33
; WAVE32-NEXT: v_readlane_b32 s4, v40, 2
; WAVE32-NEXT: s_or_saveexec_b32 s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index a0c25b2a0beb3..705d403764503 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -489,22 +489,20 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: v_writelane_b32 v41, s16, 16
; CHECK-NEXT: .cfi_llvm_vector_registers 65, 2601, 16, 32
; CHECK-NEXT: .cfi_def_cfa_register 65
-; CHECK-NEXT: v_writelane_b32 v41, s30, 0
-; CHECK-NEXT: v_writelane_b32 v41, s31, 1
-; CHECK-NEXT: v_writelane_b32 v41, s34, 2
-; CHECK-NEXT: v_writelane_b32 v41, s35, 3
-; CHECK-NEXT: v_writelane_b32 v41, s36, 4
-; CHECK-NEXT: v_writelane_b32 v41, s37, 5
-; CHECK-NEXT: v_writelane_b32 v41, s38, 6
-; CHECK-NEXT: v_writelane_b32 v41, s39, 7
-; CHECK-NEXT: v_writelane_b32 v41, s48, 8
-; CHECK-NEXT: v_writelane_b32 v41, s49, 9
-; CHECK-NEXT: v_writelane_b32 v41, s50, 10
-; CHECK-NEXT: v_writelane_b32 v41, s51, 11
-; CHECK-NEXT: v_writelane_b32 v41, s52, 12
+; CHECK-NEXT: v_writelane_b32 v41, s34, 0
+; CHECK-NEXT: v_writelane_b32 v41, s35, 1
+; CHECK-NEXT: v_writelane_b32 v41, s36, 2
+; CHECK-NEXT: v_writelane_b32 v41, s37, 3
+; CHECK-NEXT: v_writelane_b32 v41, s38, 4
+; CHECK-NEXT: v_writelane_b32 v41, s39, 5
+; CHECK-NEXT: v_writelane_b32 v41, s48, 6
+; CHECK-NEXT: v_writelane_b32 v41, s49, 7
+; CHECK-NEXT: v_writelane_b32 v41, s50, 8
+; CHECK-NEXT: v_writelane_b32 v41, s51, 9
+; CHECK-NEXT: v_writelane_b32 v41, s52, 10
; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v41, s53, 13
-; CHECK-NEXT: v_writelane_b32 v41, s54, 14
+; CHECK-NEXT: v_writelane_b32 v41, s53, 11
+; CHECK-NEXT: v_writelane_b32 v41, s54, 12
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef
; CHECK-NEXT: .Ltmp0:
@@ -512,10 +510,12 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12
-; CHECK-NEXT: v_writelane_b32 v41, s55, 15
+; CHECK-NEXT: v_writelane_b32 v41, s55, 13
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v41, s30, 14
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: v_writelane_b32 v41, s31, 15
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -541,23 +541,23 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_readlane_b32 s30, v41, 14
; CHECK-NEXT: flat_store_dword v[0:1], v2
-; CHECK-NEXT: v_readlane_b32 s55, v41, 15
-; CHECK-NEXT: v_readlane_b32 s54, v41, 14
-; CHECK-NEXT: v_readlane_b32 s53, v41, 13
-; CHECK-NEXT: v_readlane_b32 s52, v41, 12
-; CHECK-NEXT: v_readlane_b32 s51, v41, 11
-; CHECK-NEXT: v_readlane_b32 s50, v41, 10
-; CHECK-NEXT: v_readlane_b32 s49, v41, 9
-; CHECK-NEXT: v_readlane_b32 s48, v41, 8
-; CHECK-NEXT: v_readlane_b32 s39, v41, 7
-; CHECK-NEXT: v_readlane_b32 s38, v41, 6
-; CHECK-NEXT: v_readlane_b32 s37, v41, 5
-; CHECK-NEXT: v_readlane_b32 s36, v41, 4
-; CHECK-NEXT: v_readlane_b32 s35, v41, 3
-; CHECK-NEXT: v_readlane_b32 s34, v41, 2
-; CHECK-NEXT: v_readlane_b32 s31, v41, 1
-; CHECK-NEXT: v_readlane_b32 s30, v41, 0
+; CHECK-NEXT: v_readlane_b32 s31, v41, 15
+; CHECK-NEXT: v_readlane_b32 s55, v41, 13
+; CHECK-NEXT: v_readlane_b32 s54, v41, 12
+; CHECK-NEXT: v_readlane_b32 s53, v41, 11
+; CHECK-NEXT: v_readlane_b32 s52, v41, 10
+; CHECK-NEXT: v_readlane_b32 s51, v41, 9
+; CHECK-NEXT: v_readlane_b32 s50, v41, 8
+; CHECK-NEXT: v_readlane_b32 s49, v41, 7
+; CHECK-NEXT: v_readlane_b32 s48, v41, 6
+; CHECK-NEXT: v_readlane_b32 s39, v41, 5
+; CHECK-NEXT: v_readlane_b32 s38, v41, 4
+; CHECK-NEXT: v_readlane_b32 s37, v41, 3
+; CHECK-NEXT: v_readlane_b32 s36, v41, 2
+; CHECK-NEXT: v_readlane_b32 s35, v41, 1
+; CHECK-NEXT: v_readlane_b32 s34, v41, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v41, 16
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index d224cfe27b226..fbacc61492674 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -299,8 +299,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
; CHECK-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; CHECK-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-TRUE16-NEXT: s_mov_b32 s32, s33
; CHECK-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; CHECK-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -339,8 +339,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 {
; CHECK-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
; CHECK-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-FAKE16-NEXT: s_mov_b32 s32, s33
; CHECK-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; CHECK-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
index 1269b2d0f7367..2d620a14da405 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll
@@ -24,34 +24,34 @@ define void @wobble() #0 {
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, 0
; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v43, s30, 0
-; CHECK-NEXT: v_writelane_b32 v43, s31, 1
-; CHECK-NEXT: v_writelane_b32 v43, s34, 2
-; CHECK-NEXT: v_writelane_b32 v43, s35, 3
+; CHECK-NEXT: v_writelane_b32 v43, s34, 0
+; CHECK-NEXT: v_writelane_b32 v43, s35, 1
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT: v_writelane_b32 v43, s36, 4
-; CHECK-NEXT: v_writelane_b32 v43, s37, 5
+; CHECK-NEXT: v_writelane_b32 v43, s36, 2
+; CHECK-NEXT: v_writelane_b32 v43, s37, 3
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base
; CHECK-NEXT: v_mov_b32_e32 v42, s9
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s38, 4
+; CHECK-NEXT: v_writelane_b32 v43, s39, 5
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT: v_writelane_b32 v43, s48, 8
-; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: v_writelane_b32 v43, s48, 6
+; CHECK-NEXT: v_writelane_b32 v43, s49, 7
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_lshr_b32 s5, s33, 5
; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: v_writelane_b32 v43, s50, 10
+; CHECK-NEXT: v_writelane_b32 v43, s50, 8
; CHECK-NEXT: s_mov_b32 s50, s15
-; CHECK-NEXT: v_writelane_b32 v43, s51, 11
+; CHECK-NEXT: v_writelane_b32 v43, s51, 9
; CHECK-NEXT: s_mov_b32 s51, s14
-; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: v_writelane_b32 v43, s52, 10
; CHECK-NEXT: s_mov_b32 s52, s13
-; CHECK-NEXT: v_writelane_b32 v43, s53, 13
+; CHECK-NEXT: v_writelane_b32 v43, s53, 11
; CHECK-NEXT: s_mov_b32 s53, s12
-; CHECK-NEXT: v_writelane_b32 v43, s54, 14
+; CHECK-NEXT: v_writelane_b32 v43, s54, 12
; CHECK-NEXT: s_add_i32 s54, s5, 16
+; CHECK-NEXT: v_writelane_b32 v43, s30, 13
+; CHECK-NEXT: v_writelane_b32 v43, s31, 14
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_1: ; %bb1
@@ -91,21 +91,21 @@ define void @wobble() #0 {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8
-; CHECK-NEXT: v_readlane_b32 s54, v43, 14
-; CHECK-NEXT: v_readlane_b32 s53, v43, 13
-; CHECK-NEXT: v_readlane_b32 s52, v43, 12
-; CHECK-NEXT: v_readlane_b32 s51, v43, 11
-; CHECK-NEXT: v_readlane_b32 s50, v43, 10
-; CHECK-NEXT: v_readlane_b32 s49, v43, 9
-; CHECK-NEXT: v_readlane_b32 s48, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
-; CHECK-NEXT: v_readlane_b32 s37, v43, 5
-; CHECK-NEXT: v_readlane_b32 s36, v43, 4
-; CHECK-NEXT: v_readlane_b32 s35, v43, 3
-; CHECK-NEXT: v_readlane_b32 s34, v43, 2
-; CHECK-NEXT: v_readlane_b32 s31, v43, 1
-; CHECK-NEXT: v_readlane_b32 s30, v43, 0
+; CHECK-NEXT: v_readlane_b32 s30, v43, 13
+; CHECK-NEXT: v_readlane_b32 s31, v43, 14
+; CHECK-NEXT: v_readlane_b32 s54, v43, 12
+; CHECK-NEXT: v_readlane_b32 s53, v43, 11
+; CHECK-NEXT: v_readlane_b32 s52, v43, 10
+; CHECK-NEXT: v_readlane_b32 s51, v43, 9
+; CHECK-NEXT: v_readlane_b32 s50, v43, 8
+; CHECK-NEXT: v_readlane_b32 s49, v43, 7
+; CHECK-NEXT: v_readlane_b32 s48, v43, 6
+; CHECK-NEXT: v_readlane_b32 s39, v43, 5
+; CHECK-NEXT: v_readlane_b32 s38, v43, 4
+; CHECK-NEXT: v_readlane_b32 s37, v43, 3
+; CHECK-NEXT: v_readlane_b32 s36, v43, 2
+; CHECK-NEXT: v_readlane_b32 s35, v43, 1
+; CHECK-NEXT: v_readlane_b32 s34, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 15
; CHECK-NEXT: s_or_saveexec_b32 s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 76a2114a000cf..cba5aa8ef3672 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -55,8 +55,8 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: v_readlane_b32 s31, v42, 1
; GCN-NEXT: v_readlane_b32 s30, v42, 0
+; GCN-NEXT: v_readlane_b32 s31, v42, 1
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: v_readlane_b32 s4, v42, 2
; GCN-NEXT: v_readlane_b32 s34, v42, 3
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 2e88da142bb41..6abe5998d6767 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -26,8 +26,8 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0
+; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1
; SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2
; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -46,21 +46,14 @@ define void @callee_with_stack_and_call() #0 {
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x800
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s30, 0
+; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s31, 1
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s31, 0
-; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0
; NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5]
; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
@@ -69,20 +62,12 @@ define void @callee_with_stack_and_call() #0 {
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v0, 0
-; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3
; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v0, 0
+; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v0, 1
; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 16c3bb667e06d..95316ad7d66d8 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1737,8 +1737,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -1767,8 +1767,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2156,8 +2156,8 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -2198,8 +2198,8 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 9d137fb4101e4..a2f203a111e18 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -39,47 +39,47 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-NEXT: v_writelane_b32 v40, s27, 23
; SDAG-NEXT: v_writelane_b32 v40, s28, 24
; SDAG-NEXT: v_writelane_b32 v40, s29, 25
-; SDAG-NEXT: v_writelane_b32 v40, s30, 26
-; SDAG-NEXT: v_writelane_b32 v40, s31, 27
-; SDAG-NEXT: v_writelane_b32 v40, s72, 28
-; SDAG-NEXT: v_writelane_b32 v40, s73, 29
-; SDAG-NEXT: v_writelane_b32 v40, s74, 30
-; SDAG-NEXT: v_writelane_b32 v40, s75, 31
-; SDAG-NEXT: v_writelane_b32 v40, s76, 32
-; SDAG-NEXT: v_writelane_b32 v40, s77, 33
-; SDAG-NEXT: v_writelane_b32 v40, s78, 34
-; SDAG-NEXT: v_writelane_b32 v40, s79, 35
-; SDAG-NEXT: v_writelane_b32 v40, s88, 36
-; SDAG-NEXT: v_writelane_b32 v40, s89, 37
-; SDAG-NEXT: v_writelane_b32 v40, s90, 38
-; SDAG-NEXT: v_writelane_b32 v40, s91, 39
-; SDAG-NEXT: v_writelane_b32 v40, s92, 40
-; SDAG-NEXT: v_writelane_b32 v40, s93, 41
-; SDAG-NEXT: v_writelane_b32 v40, s94, 42
+; SDAG-NEXT: v_writelane_b32 v40, s72, 26
+; SDAG-NEXT: v_writelane_b32 v40, s73, 27
+; SDAG-NEXT: v_writelane_b32 v40, s74, 28
+; SDAG-NEXT: v_writelane_b32 v40, s75, 29
+; SDAG-NEXT: v_writelane_b32 v40, s76, 30
+; SDAG-NEXT: v_writelane_b32 v40, s77, 31
+; SDAG-NEXT: v_writelane_b32 v40, s78, 32
+; SDAG-NEXT: v_writelane_b32 v40, s79, 33
+; SDAG-NEXT: v_writelane_b32 v40, s88, 34
+; SDAG-NEXT: v_writelane_b32 v40, s89, 35
+; SDAG-NEXT: v_writelane_b32 v40, s90, 36
+; SDAG-NEXT: v_writelane_b32 v40, s91, 37
+; SDAG-NEXT: v_writelane_b32 v40, s92, 38
+; SDAG-NEXT: v_writelane_b32 v40, s93, 39
+; SDAG-NEXT: v_writelane_b32 v40, s94, 40
+; SDAG-NEXT: v_writelane_b32 v40, s95, 41
+; SDAG-NEXT: v_writelane_b32 v40, s30, 42
; SDAG-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi
; SDAG-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo
; SDAG-NEXT: s_mov_b64 s[8:9], 0
; SDAG-NEXT: s_addk_i32 s32, 0x400
-; SDAG-NEXT: v_writelane_b32 v40, s95, 43
+; SDAG-NEXT: v_writelane_b32 v40, s31, 43
; SDAG-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; SDAG-NEXT: v_readlane_b32 s95, v40, 43
-; SDAG-NEXT: v_readlane_b32 s94, v40, 42
-; SDAG-NEXT: v_readlane_b32 s93, v40, 41
-; SDAG-NEXT: v_readlane_b32 s92, v40, 40
-; SDAG-NEXT: v_readlane_b32 s91, v40, 39
-; SDAG-NEXT: v_readlane_b32 s90, v40, 38
-; SDAG-NEXT: v_readlane_b32 s89, v40, 37
-; SDAG-NEXT: v_readlane_b32 s88, v40, 36
-; SDAG-NEXT: v_readlane_b32 s79, v40, 35
-; SDAG-NEXT: v_readlane_b32 s78, v40, 34
-; SDAG-NEXT: v_readlane_b32 s77, v40, 33
-; SDAG-NEXT: v_readlane_b32 s76, v40, 32
-; SDAG-NEXT: v_readlane_b32 s75, v40, 31
-; SDAG-NEXT: v_readlane_b32 s74, v40, 30
-; SDAG-NEXT: v_readlane_b32 s73, v40, 29
-; SDAG-NEXT: v_readlane_b32 s72, v40, 28
-; SDAG-NEXT: v_readlane_b32 s31, v40, 27
-; SDAG-NEXT: v_readlane_b32 s30, v40, 26
+; SDAG-NEXT: v_readlane_b32 s30, v40, 42
+; SDAG-NEXT: v_readlane_b32 s31, v40, 43
+; SDAG-NEXT: v_readlane_b32 s95, v40, 41
+; SDAG-NEXT: v_readlane_b32 s94, v40, 40
+; SDAG-NEXT: v_readlane_b32 s93, v40, 39
+; SDAG-NEXT: v_readlane_b32 s92, v40, 38
+; SDAG-NEXT: v_readlane_b32 s91, v40, 37
+; SDAG-NEXT: v_readlane_b32 s90, v40, 36
+; SDAG-NEXT: v_readlane_b32 s89, v40, 35
+; SDAG-NEXT: v_readlane_b32 s88, v40, 34
+; SDAG-NEXT: v_readlane_b32 s79, v40, 33
+; SDAG-NEXT: v_readlane_b32 s78, v40, 32
+; SDAG-NEXT: v_readlane_b32 s77, v40, 31
+; SDAG-NEXT: v_readlane_b32 s76, v40, 30
+; SDAG-NEXT: v_readlane_b32 s75, v40, 29
+; SDAG-NEXT: v_readlane_b32 s74, v40, 28
+; SDAG-NEXT: v_readlane_b32 s73, v40, 27
+; SDAG-NEXT: v_readlane_b32 s72, v40, 26
; SDAG-NEXT: v_readlane_b32 s29, v40, 25
; SDAG-NEXT: v_readlane_b32 s28, v40, 24
; SDAG-NEXT: v_readlane_b32 s27, v40, 23
@@ -148,47 +148,47 @@ define amdgpu_gfx void @gfx_func() {
; GISEL-NEXT: v_writelane_b32 v40, s27, 23
; GISEL-NEXT: v_writelane_b32 v40, s28, 24
; GISEL-NEXT: v_writelane_b32 v40, s29, 25
-; GISEL-NEXT: v_writelane_b32 v40, s30, 26
-; GISEL-NEXT: v_writelane_b32 v40, s31, 27
-; GISEL-NEXT: v_writelane_b32 v40, s72, 28
-; GISEL-NEXT: v_writelane_b32 v40, s73, 29
-; GISEL-NEXT: v_writelane_b32 v40, s74, 30
-; GISEL-NEXT: v_writelane_b32 v40, s75, 31
-; GISEL-NEXT: v_writelane_b32 v40, s76, 32
-; GISEL-NEXT: v_writelane_b32 v40, s77, 33
-; GISEL-NEXT: v_writelane_b32 v40, s78, 34
-; GISEL-NEXT: v_writelane_b32 v40, s79, 35
-; GISEL-NEXT: v_writelane_b32 v40, s88, 36
-; GISEL-NEXT: v_writelane_b32 v40, s89, 37
-; GISEL-NEXT: v_writelane_b32 v40, s90, 38
-; GISEL-NEXT: v_writelane_b32 v40, s91, 39
-; GISEL-NEXT: v_writelane_b32 v40, s92, 40
-; GISEL-NEXT: v_writelane_b32 v40, s93, 41
-; GISEL-NEXT: v_writelane_b32 v40, s94, 42
+; GISEL-NEXT: v_writelane_b32 v40, s72, 26
+; GISEL-NEXT: v_writelane_b32 v40, s73, 27
+; GISEL-NEXT: v_writelane_b32 v40, s74, 28
+; GISEL-NEXT: v_writelane_b32 v40, s75, 29
+; GISEL-NEXT: v_writelane_b32 v40, s76, 30
+; GISEL-NEXT: v_writelane_b32 v40, s77, 31
+; GISEL-NEXT: v_writelane_b32 v40, s78, 32
+; GISEL-NEXT: v_writelane_b32 v40, s79, 33
+; GISEL-NEXT: v_writelane_b32 v40, s88, 34
+; GISEL-NEXT: v_writelane_b32 v40, s89, 35
+; GISEL-NEXT: v_writelane_b32 v40, s90, 36
+; GISEL-NEXT: v_writelane_b32 v40, s91, 37
+; GISEL-NEXT: v_writelane_b32 v40, s92, 38
+; GISEL-NEXT: v_writelane_b32 v40, s93, 39
+; GISEL-NEXT: v_writelane_b32 v40, s94, 40
+; GISEL-NEXT: v_writelane_b32 v40, s95, 41
+; GISEL-NEXT: v_writelane_b32 v40, s30, 42
; GISEL-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo
; GISEL-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s95, 43
+; GISEL-NEXT: v_writelane_b32 v40, s31, 43
; GISEL-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GISEL-NEXT: v_readlane_b32 s95, v40, 43
-; GISEL-NEXT: v_readlane_b32 s94, v40, 42
-; GISEL-NEXT: v_readlane_b32 s93, v40, 41
-; GISEL-NEXT: v_readlane_b32 s92, v40, 40
-; GISEL-NEXT: v_readlane_b32 s91, v40, 39
-; GISEL-NEXT: v_readlane_b32 s90, v40, 38
-; GISEL-NEXT: v_readlane_b32 s89, v40, 37
-; GISEL-NEXT: v_readlane_b32 s88, v40, 36
-; GISEL-NEXT: v_readlane_b32 s79, v40, 35
-; GISEL-NEXT: v_readlane_b32 s78, v40, 34
-; GISEL-NEXT: v_readlane_b32 s77, v40, 33
-; GISEL-NEXT: v_readlane_b32 s76, v40, 32
-; GISEL-NEXT: v_readlane_b32 s75, v40, 31
-; GISEL-NEXT: v_readlane_b32 s74, v40, 30
-; GISEL-NEXT: v_readlane_b32 s73, v40, 29
-; GISEL-NEXT: v_readlane_b32 s72, v40, 28
-; GISEL-NEXT: v_readlane_b32 s31, v40, 27
-; GISEL-NEXT: v_readlane_b32 s30, v40, 26
+; GISEL-NEXT: v_readlane_b32 s30, v40, 42
+; GISEL-NEXT: v_readlane_b32 s31, v40, 43
+; GISEL-NEXT: v_readlane_b32 s95, v40, 41
+; GISEL-NEXT: v_readlane_b32 s94, v40, 40
+; GISEL-NEXT: v_readlane_b32 s93, v40, 39
+; GISEL-NEXT: v_readlane_b32 s92, v40, 38
+; GISEL-NEXT: v_readlane_b32 s91, v40, 37
+; GISEL-NEXT: v_readlane_b32 s90, v40, 36
+; GISEL-NEXT: v_readlane_b32 s89, v40, 35
+; GISEL-NEXT: v_readlane_b32 s88, v40, 34
+; GISEL-NEXT: v_readlane_b32 s79, v40, 33
+; GISEL-NEXT: v_readlane_b32 s78, v40, 32
+; GISEL-NEXT: v_readlane_b32 s77, v40, 31
+; GISEL-NEXT: v_readlane_b32 s76, v40, 30
+; GISEL-NEXT: v_readlane_b32 s75, v40, 29
+; GISEL-NEXT: v_readlane_b32 s74, v40, 28
+; GISEL-NEXT: v_readlane_b32 s73, v40, 27
+; GISEL-NEXT: v_readlane_b32 s72, v40, 26
; GISEL-NEXT: v_readlane_b32 s29, v40, 25
; GISEL-NEXT: v_readlane_b32 s28, v40, 24
; GISEL-NEXT: v_readlane_b32 s27, v40, 23
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index b5a5a930000e5..d6e606c96bc75 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -141,8 +141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -170,8 +170,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -200,8 +200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -229,8 +229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -264,8 +264,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -295,8 +295,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -326,8 +326,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -357,8 +357,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -393,8 +393,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -424,8 +424,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -455,8 +455,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -486,8 +486,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -519,8 +519,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -547,8 +547,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -576,8 +576,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -604,8 +604,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -632,8 +632,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -665,8 +665,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -694,8 +694,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -724,8 +724,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -753,8 +753,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -782,8 +782,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -816,8 +816,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -845,8 +845,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -875,8 +875,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -933,8 +933,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -966,8 +966,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -994,8 +994,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1023,8 +1023,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1051,8 +1051,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1079,8 +1079,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1112,8 +1112,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1141,8 +1141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1171,8 +1171,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1200,8 +1200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1229,8 +1229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1263,8 +1263,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1292,8 +1292,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1322,8 +1322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1351,8 +1351,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -1380,8 +1380,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1413,8 +1413,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1441,8 +1441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1470,8 +1470,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1498,8 +1498,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1531,8 +1531,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1560,8 +1560,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1589,8 +1589,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1618,8 +1618,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1652,8 +1652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1682,8 +1682,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1713,8 +1713,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1743,8 +1743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1779,8 +1779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1810,8 +1810,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1840,8 +1840,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1871,8 +1871,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -1907,8 +1907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -1939,8 +1939,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -1970,8 +1970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2002,8 +2002,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2043,8 +2043,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2077,8 +2077,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2109,8 +2109,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2143,8 +2143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2177,8 +2177,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2205,8 +2205,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2234,8 +2234,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -2262,8 +2262,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -2290,8 +2290,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2322,8 +2322,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2350,8 +2350,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2379,8 +2379,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2407,8 +2407,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2440,8 +2440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2469,8 +2469,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2498,8 +2498,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2527,8 +2527,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2561,8 +2561,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2591,8 +2591,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2621,8 +2621,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2651,8 +2651,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2687,8 +2687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2719,8 +2719,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2750,8 +2750,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2782,8 +2782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2815,8 +2815,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2844,8 +2844,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2873,8 +2873,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -2902,8 +2902,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -2937,8 +2937,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -2968,8 +2968,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -2998,8 +2998,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3029,8 +3029,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3066,8 +3066,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3099,8 +3099,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3130,8 +3130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3163,8 +3163,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3200,8 +3200,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3233,8 +3233,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3267,8 +3267,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -3300,8 +3300,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -3333,8 +3333,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3371,8 +3371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3404,8 +3404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3437,8 +3437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3470,8 +3470,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3509,8 +3509,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3543,8 +3543,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3577,8 +3577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3611,8 +3611,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3652,8 +3652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: v_mov_b32_e32 v4, v6
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3688,8 +3688,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v4, v6
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3724,8 +3724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: v_mov_b32_e32 v4, v6
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3760,8 +3760,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, v5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, v6
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3804,8 +3804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX9-NEXT: v_mov_b32_e32 v4, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v8
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3843,8 +3843,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -3881,8 +3881,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -3920,8 +3920,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, v1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, v8
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -3996,8 +3996,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX9-NEXT: v_mov_b32_e32 v18, v33
; GFX9-NEXT: v_mov_b32_e32 v19, v34
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4068,8 +4068,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX10-NEXT: v_mov_b32_e32 v18, v33
; GFX10-NEXT: v_mov_b32_e32 v19, v34
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -4135,8 +4135,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX11-NEXT: v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
; GFX11-NEXT: v_mov_b32_e32 v19, v34
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -4207,8 +4207,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v18, v33
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v19, v34
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -4249,8 +4249,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4285,8 +4285,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX10-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -4322,8 +4322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -4358,8 +4358,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -4394,8 +4394,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX10-SCRATCH-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -4442,8 +4442,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4478,8 +4478,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4522,8 +4522,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
@@ -4566,8 +4566,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4607,8 +4607,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4663,8 +4663,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4701,8 +4701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX10-NEXT: v_mov_b32_e32 v3, 2
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: global_store_byte v[3:4], v2, off
; GFX10-NEXT: global_store_short v[40:41], v0, off
@@ -4745,6 +4745,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-TRUE16-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b8 v[3:4], v2, off
; GFX11-TRUE16-NEXT: global_store_b16 v[40:41], v0, off
@@ -4752,7 +4753,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -4789,8 +4789,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, 2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_store_b8 v[3:4], v2, off
@@ -4833,8 +4833,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0
; GFX10-SCRATCH-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: global_store_byte v[3:4], v2, off
; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off
@@ -4889,8 +4889,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -4927,8 +4927,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
; GFX10-NEXT: v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
@@ -4972,10 +4972,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
; GFX11-TRUE16-NEXT: v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
@@ -5017,8 +5017,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
@@ -5060,8 +5060,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
; GFX10-SCRATCH-NEXT: v_perm_b32 v1, v2, v3, 0xc0c0004
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT: v_lshl_or_b32 v0, v1, 16, v0
@@ -5121,8 +5121,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -5163,8 +5163,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX10-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
; GFX10-NEXT: v_mov_b32_e32 v0, 4
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v5
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
@@ -5208,11 +5208,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v6
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_perm_b32 v5, v0, v1, 0xc0c0004
; GFX11-TRUE16-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v5
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off
@@ -5221,7 +5222,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -5263,8 +5263,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 4
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v5
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2
@@ -5312,8 +5312,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX10-SCRATCH-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_lshl_or_b32 v2, v2, 16, v5
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
@@ -5377,8 +5377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -5422,8 +5422,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX10-NEXT: v_perm_b32 v5, v6, v7, 0xc0c0004
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
; GFX10-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v4
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
@@ -5472,18 +5472,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_perm_b32 v4, v4, v5, 0xc0c0004
; GFX11-TRUE16-NEXT: v_perm_b32 v5, v6, v7, 0xc0c0004
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
; GFX11-TRUE16-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v5, 16, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[0:1], off
; GFX11-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -5528,8 +5528,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v5, 16, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
@@ -5579,8 +5579,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX10-SCRATCH-NEXT: v_perm_b32 v5, v6, v7, 0xc0c0004
; GFX10-SCRATCH-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0004
; GFX10-SCRATCH-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_lshl_or_b32 v1, v5, 16, v4
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_lshl_or_b32 v0, v2, 16, v0
@@ -5699,8 +5699,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v44, 1
; GFX9-NEXT: v_readlane_b32 s30, v44, 0
+; GFX9-NEXT: v_readlane_b32 s31, v44, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v44, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -5806,8 +5806,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-NEXT: v_readlane_b32 s30, v44, 0
+; GFX10-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v44, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -5910,8 +5910,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-NEXT: v_readlane_b32 s30, v44, 0
+; GFX11-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v44, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6017,8 +6017,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:4
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:8
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:12
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v44, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v44, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6053,8 +6053,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6081,8 +6081,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6110,8 +6110,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6138,8 +6138,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6171,8 +6171,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6199,8 +6199,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6228,8 +6228,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6256,8 +6256,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6289,8 +6289,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6317,8 +6317,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6346,8 +6346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6374,8 +6374,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6408,8 +6408,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6437,8 +6437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6466,8 +6466,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6495,8 +6495,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6528,8 +6528,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6557,8 +6557,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6587,8 +6587,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6616,8 +6616,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6648,8 +6648,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6676,8 +6676,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6705,8 +6705,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6733,8 +6733,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6767,8 +6767,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6796,8 +6796,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6826,8 +6826,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6855,8 +6855,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -6887,8 +6887,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -6915,8 +6915,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -6944,8 +6944,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -6972,8 +6972,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7005,8 +7005,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7033,8 +7033,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7062,8 +7062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7090,8 +7090,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7124,8 +7124,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7153,8 +7153,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7182,8 +7182,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7211,8 +7211,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7245,8 +7245,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7275,8 +7275,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7305,8 +7305,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7335,8 +7335,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7370,8 +7370,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7401,8 +7401,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7431,8 +7431,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7462,8 +7462,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7494,8 +7494,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7522,8 +7522,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7551,8 +7551,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7579,8 +7579,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7615,8 +7615,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7646,8 +7646,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7676,8 +7676,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7707,8 +7707,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7743,8 +7743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7775,8 +7775,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7806,8 +7806,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7838,8 +7838,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -7874,8 +7874,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -7907,8 +7907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -7941,8 +7941,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -7974,8 +7974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8015,8 +8015,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8050,8 +8050,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8082,8 +8082,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8117,8 +8117,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8155,8 +8155,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8190,8 +8190,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8226,8 +8226,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8261,8 +8261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8306,8 +8306,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8345,8 +8345,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8385,8 +8385,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8424,8 +8424,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8472,8 +8472,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8514,8 +8514,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8556,8 +8556,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: scratch_store_b32 off, v32, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8598,8 +8598,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8)
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8641,8 +8641,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: v_readlane_b32 s31, v42, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v42, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8678,8 +8678,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v42, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8715,8 +8715,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX11-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v41, off, s33
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -8752,8 +8752,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
; GFX10-SCRATCH-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8789,8 +8789,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX9-NEXT: s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8822,8 +8822,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -8856,8 +8856,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -8889,8 +8889,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -8922,8 +8922,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -8960,8 +8960,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -8992,8 +8992,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -9026,8 +9026,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -9058,8 +9058,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -9090,8 +9090,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -9135,8 +9135,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -9177,8 +9177,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9219,8 +9219,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s33 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s33 offset:12
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -9259,8 +9259,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_load_u8 v0, off, s33 offset:8
; GFX11-FAKE16-NEXT: scratch_load_b32 v1, off, s33 offset:12
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
@@ -9301,8 +9301,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: scratch_load_ubyte v0, off, s33 offset:8
; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
@@ -9373,8 +9373,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX9-NEXT: v_mov_b32_e32 v2, v17
; GFX9-NEXT: v_mov_b32_e32 v3, v18
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -9424,8 +9424,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-NEXT: v_mov_b32_e32 v2, v17
; GFX10-NEXT: v_mov_b32_e32 v3, v18
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -9472,8 +9472,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v1, v16
; GFX11-NEXT: v_dual_mov_b32 v2, v17 :: v_dual_mov_b32 v3, v18
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -9523,8 +9523,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, v17
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, v18
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -9552,46 +9552,46 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s35, 3
-; GFX9-NEXT: v_writelane_b32 v40, s36, 4
-; GFX9-NEXT: v_writelane_b32 v40, s37, 5
-; GFX9-NEXT: v_writelane_b32 v40, s38, 6
-; GFX9-NEXT: v_writelane_b32 v40, s39, 7
-; GFX9-NEXT: v_writelane_b32 v40, s48, 8
-; GFX9-NEXT: v_writelane_b32 v40, s49, 9
-; GFX9-NEXT: v_writelane_b32 v40, s50, 10
-; GFX9-NEXT: v_writelane_b32 v40, s51, 11
-; GFX9-NEXT: v_writelane_b32 v40, s52, 12
-; GFX9-NEXT: v_writelane_b32 v40, s53, 13
+; GFX9-NEXT: v_writelane_b32 v40, s34, 0
+; GFX9-NEXT: v_writelane_b32 v40, s35, 1
+; GFX9-NEXT: v_writelane_b32 v40, s36, 2
+; GFX9-NEXT: v_writelane_b32 v40, s37, 3
+; GFX9-NEXT: v_writelane_b32 v40, s38, 4
+; GFX9-NEXT: v_writelane_b32 v40, s39, 5
+; GFX9-NEXT: v_writelane_b32 v40, s48, 6
+; GFX9-NEXT: v_writelane_b32 v40, s49, 7
+; GFX9-NEXT: v_writelane_b32 v40, s50, 8
+; GFX9-NEXT: v_writelane_b32 v40, s51, 9
+; GFX9-NEXT: v_writelane_b32 v40, s52, 10
+; GFX9-NEXT: v_writelane_b32 v40, s53, 11
+; GFX9-NEXT: v_writelane_b32 v40, s54, 12
+; GFX9-NEXT: v_writelane_b32 v40, s55, 13
; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: v_writelane_b32 v40, s54, 14
+; GFX9-NEXT: v_writelane_b32 v40, s30, 14
; GFX9-NEXT: s_mov_b32 s5, byval_align16_f64_arg at abs32@hi
; GFX9-NEXT: s_mov_b32 s4, byval_align16_f64_arg at abs32@lo
-; GFX9-NEXT: v_writelane_b32 v40, s55, 15
+; GFX9-NEXT: v_writelane_b32 v40, s31, 15
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s55, v40, 15
-; GFX9-NEXT: v_readlane_b32 s54, v40, 14
-; GFX9-NEXT: v_readlane_b32 s53, v40, 13
-; GFX9-NEXT: v_readlane_b32 s52, v40, 12
-; GFX9-NEXT: v_readlane_b32 s51, v40, 11
-; GFX9-NEXT: v_readlane_b32 s50, v40, 10
-; GFX9-NEXT: v_readlane_b32 s49, v40, 9
-; GFX9-NEXT: v_readlane_b32 s48, v40, 8
-; GFX9-NEXT: v_readlane_b32 s39, v40, 7
-; GFX9-NEXT: v_readlane_b32 s38, v40, 6
-; GFX9-NEXT: v_readlane_b32 s37, v40, 5
-; GFX9-NEXT: v_readlane_b32 s36, v40, 4
-; GFX9-NEXT: v_readlane_b32 s35, v40, 3
-; GFX9-NEXT: v_readlane_b32 s34, v40, 2
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s30, v40, 14
+; GFX9-NEXT: v_readlane_b32 s31, v40, 15
+; GFX9-NEXT: v_readlane_b32 s55, v40, 13
+; GFX9-NEXT: v_readlane_b32 s54, v40, 12
+; GFX9-NEXT: v_readlane_b32 s53, v40, 11
+; GFX9-NEXT: v_readlane_b32 s52, v40, 10
+; GFX9-NEXT: v_readlane_b32 s51, v40, 9
+; GFX9-NEXT: v_readlane_b32 s50, v40, 8
+; GFX9-NEXT: v_readlane_b32 s49, v40, 7
+; GFX9-NEXT: v_readlane_b32 s48, v40, 6
+; GFX9-NEXT: v_readlane_b32 s39, v40, 5
+; GFX9-NEXT: v_readlane_b32 s38, v40, 4
+; GFX9-NEXT: v_readlane_b32 s37, v40, 3
+; GFX9-NEXT: v_readlane_b32 s36, v40, 2
+; GFX9-NEXT: v_readlane_b32 s35, v40, 1
+; GFX9-NEXT: v_readlane_b32 s34, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9613,7 +9613,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s34, 0
; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_mov_b32 s5, byval_align16_f64_arg at abs32@hi
; GFX10-NEXT: s_mov_b32 s4, byval_align16_f64_arg at abs32@lo
@@ -9621,38 +9621,38 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_writelane_b32 v40, s35, 3
-; GFX10-NEXT: v_writelane_b32 v40, s36, 4
-; GFX10-NEXT: v_writelane_b32 v40, s37, 5
-; GFX10-NEXT: v_writelane_b32 v40, s38, 6
-; GFX10-NEXT: v_writelane_b32 v40, s39, 7
-; GFX10-NEXT: v_writelane_b32 v40, s48, 8
-; GFX10-NEXT: v_writelane_b32 v40, s49, 9
-; GFX10-NEXT: v_writelane_b32 v40, s50, 10
-; GFX10-NEXT: v_writelane_b32 v40, s51, 11
-; GFX10-NEXT: v_writelane_b32 v40, s52, 12
-; GFX10-NEXT: v_writelane_b32 v40, s53, 13
-; GFX10-NEXT: v_writelane_b32 v40, s54, 14
-; GFX10-NEXT: v_writelane_b32 v40, s55, 15
+; GFX10-NEXT: v_writelane_b32 v40, s35, 1
+; GFX10-NEXT: v_writelane_b32 v40, s36, 2
+; GFX10-NEXT: v_writelane_b32 v40, s37, 3
+; GFX10-NEXT: v_writelane_b32 v40, s38, 4
+; GFX10-NEXT: v_writelane_b32 v40, s39, 5
+; GFX10-NEXT: v_writelane_b32 v40, s48, 6
+; GFX10-NEXT: v_writelane_b32 v40, s49, 7
+; GFX10-NEXT: v_writelane_b32 v40, s50, 8
+; GFX10-NEXT: v_writelane_b32 v40, s51, 9
+; GFX10-NEXT: v_writelane_b32 v40, s52, 10
+; GFX10-NEXT: v_writelane_b32 v40, s53, 11
+; GFX10-NEXT: v_writelane_b32 v40, s54, 12
+; GFX10-NEXT: v_writelane_b32 v40, s55, 13
+; GFX10-NEXT: v_writelane_b32 v40, s30, 14
+; GFX10-NEXT: v_writelane_b32 v40, s31, 15
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_readlane_b32 s55, v40, 15
-; GFX10-NEXT: v_readlane_b32 s54, v40, 14
-; GFX10-NEXT: v_readlane_b32 s53, v40, 13
-; GFX10-NEXT: v_readlane_b32 s52, v40, 12
-; GFX10-NEXT: v_readlane_b32 s51, v40, 11
-; GFX10-NEXT: v_readlane_b32 s50, v40, 10
-; GFX10-NEXT: v_readlane_b32 s49, v40, 9
-; GFX10-NEXT: v_readlane_b32 s48, v40, 8
-; GFX10-NEXT: v_readlane_b32 s39, v40, 7
-; GFX10-NEXT: v_readlane_b32 s38, v40, 6
-; GFX10-NEXT: v_readlane_b32 s37, v40, 5
-; GFX10-NEXT: v_readlane_b32 s36, v40, 4
-; GFX10-NEXT: v_readlane_b32 s35, v40, 3
-; GFX10-NEXT: v_readlane_b32 s34, v40, 2
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
-; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s30, v40, 14
+; GFX10-NEXT: v_readlane_b32 s31, v40, 15
+; GFX10-NEXT: v_readlane_b32 s55, v40, 13
+; GFX10-NEXT: v_readlane_b32 s54, v40, 12
+; GFX10-NEXT: v_readlane_b32 s53, v40, 11
+; GFX10-NEXT: v_readlane_b32 s52, v40, 10
+; GFX10-NEXT: v_readlane_b32 s51, v40, 9
+; GFX10-NEXT: v_readlane_b32 s50, v40, 8
+; GFX10-NEXT: v_readlane_b32 s49, v40, 7
+; GFX10-NEXT: v_readlane_b32 s48, v40, 6
+; GFX10-NEXT: v_readlane_b32 s39, v40, 5
+; GFX10-NEXT: v_readlane_b32 s38, v40, 4
+; GFX10-NEXT: v_readlane_b32 s37, v40, 3
+; GFX10-NEXT: v_readlane_b32 s36, v40, 2
+; GFX10-NEXT: v_readlane_b32 s35, v40, 1
+; GFX10-NEXT: v_readlane_b32 s34, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
@@ -9673,44 +9673,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16
; GFX11-NEXT: scratch_load_b32 v31, off, s33
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s34, 0
; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_mov_b32 s1, byval_align16_f64_arg at abs32@hi
; GFX11-NEXT: s_mov_b32 s0, byval_align16_f64_arg at abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-NEXT: v_writelane_b32 v40, s49, 9
-; GFX11-NEXT: v_writelane_b32 v40, s50, 10
-; GFX11-NEXT: v_writelane_b32 v40, s51, 11
-; GFX11-NEXT: v_writelane_b32 v40, s52, 12
-; GFX11-NEXT: v_writelane_b32 v40, s53, 13
-; GFX11-NEXT: v_writelane_b32 v40, s54, 14
-; GFX11-NEXT: v_writelane_b32 v40, s55, 15
+; GFX11-NEXT: v_writelane_b32 v40, s35, 1
+; GFX11-NEXT: v_writelane_b32 v40, s36, 2
+; GFX11-NEXT: v_writelane_b32 v40, s37, 3
+; GFX11-NEXT: v_writelane_b32 v40, s38, 4
+; GFX11-NEXT: v_writelane_b32 v40, s39, 5
+; GFX11-NEXT: v_writelane_b32 v40, s48, 6
+; GFX11-NEXT: v_writelane_b32 v40, s49, 7
+; GFX11-NEXT: v_writelane_b32 v40, s50, 8
+; GFX11-NEXT: v_writelane_b32 v40, s51, 9
+; GFX11-NEXT: v_writelane_b32 v40, s52, 10
+; GFX11-NEXT: v_writelane_b32 v40, s53, 11
+; GFX11-NEXT: v_writelane_b32 v40, s54, 12
+; GFX11-NEXT: v_writelane_b32 v40, s55, 13
+; GFX11-NEXT: v_writelane_b32 v40, s30, 14
+; GFX11-NEXT: v_writelane_b32 v40, s31, 15
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s55, v40, 15
-; GFX11-NEXT: v_readlane_b32 s54, v40, 14
-; GFX11-NEXT: v_readlane_b32 s53, v40, 13
-; GFX11-NEXT: v_readlane_b32 s52, v40, 12
-; GFX11-NEXT: v_readlane_b32 s51, v40, 11
-; GFX11-NEXT: v_readlane_b32 s50, v40, 10
-; GFX11-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s30, v40, 14
+; GFX11-NEXT: v_readlane_b32 s31, v40, 15
+; GFX11-NEXT: v_readlane_b32 s55, v40, 13
+; GFX11-NEXT: v_readlane_b32 s54, v40, 12
+; GFX11-NEXT: v_readlane_b32 s53, v40, 11
+; GFX11-NEXT: v_readlane_b32 s52, v40, 10
+; GFX11-NEXT: v_readlane_b32 s51, v40, 9
+; GFX11-NEXT: v_readlane_b32 s50, v40, 8
+; GFX11-NEXT: v_readlane_b32 s49, v40, 7
+; GFX11-NEXT: v_readlane_b32 s48, v40, 6
+; GFX11-NEXT: v_readlane_b32 s39, v40, 5
+; GFX11-NEXT: v_readlane_b32 s38, v40, 4
+; GFX11-NEXT: v_readlane_b32 s37, v40, 3
+; GFX11-NEXT: v_readlane_b32 s36, v40, 2
+; GFX11-NEXT: v_readlane_b32 s35, v40, 1
+; GFX11-NEXT: v_readlane_b32 s34, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -9731,44 +9731,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16
; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, byval_align16_f64_arg at abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, byval_align16_f64_arg at abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s35, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 7
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 9
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 10
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 11
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 13
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 14
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 15
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s35, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 7
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 9
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 11
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 12
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 13
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 14
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 15
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1)
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 15
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 14
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 13
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 12
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 11
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 10
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 9
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 8
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 7
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s35, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 14
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 15
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 13
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 12
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 11
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 9
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 7
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s35, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload
@@ -9802,8 +9802,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -9831,8 +9831,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -9861,8 +9861,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -9890,8 +9890,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -9923,8 +9923,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -9953,8 +9953,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -9984,8 +9984,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -10014,8 +10014,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -10048,8 +10048,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -10078,8 +10078,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -10109,8 +10109,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -10139,8 +10139,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -10173,8 +10173,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -10203,8 +10203,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -10234,8 +10234,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -10264,8 +10264,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -10300,8 +10300,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -10333,8 +10333,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -10367,8 +10367,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -10400,8 +10400,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -10439,8 +10439,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -10476,8 +10476,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -10514,8 +10514,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -10551,8 +10551,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -10595,8 +10595,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -10634,8 +10634,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -10674,8 +10674,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -10713,8 +10713,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -10758,8 +10758,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 7
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 7
; GFX9-NEXT: v_readlane_b32 s30, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 7
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
; GFX9-NEXT: v_readlane_b32 s8, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
@@ -10801,8 +10801,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-NEXT: v_writelane_b32 v40, s31, 7
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-NEXT: v_readlane_b32 s30, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
; GFX10-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
@@ -10845,8 +10845,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 7
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s30, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -10888,8 +10888,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
@@ -10942,8 +10942,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
+; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s11, v40, 7
; GFX9-NEXT: v_readlane_b32 s10, v40, 6
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
@@ -10991,8 +10991,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
@@ -11041,8 +11041,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
; GFX11-NEXT: v_readlane_b32 s10, v40, 6
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
@@ -11090,8 +11090,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
@@ -11133,8 +11133,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -11163,8 +11163,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -11194,8 +11194,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -11224,8 +11224,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -11258,8 +11258,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -11288,8 +11288,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -11319,8 +11319,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -11349,8 +11349,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -11385,8 +11385,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -11418,8 +11418,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -11452,8 +11452,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -11485,8 +11485,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -11524,8 +11524,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 4
; GFX9-NEXT: v_readlane_b32 s30, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 4
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
@@ -11560,8 +11560,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-NEXT: v_writelane_b32 v40, s31, 4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-NEXT: v_readlane_b32 s30, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
@@ -11597,8 +11597,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 4
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s30, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -11633,8 +11633,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
@@ -11677,8 +11677,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 6
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 6
; GFX9-NEXT: v_readlane_b32 s30, v40, 5
+; GFX9-NEXT: v_readlane_b32 s31, v40, 6
; GFX9-NEXT: v_readlane_b32 s8, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
@@ -11719,8 +11719,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-NEXT: v_readlane_b32 s30, v40, 5
+; GFX10-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
@@ -11762,8 +11762,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 6
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s30, v40, 5
+; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
@@ -11804,8 +11804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
@@ -11844,8 +11844,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -11877,8 +11877,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -11911,8 +11911,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -11944,8 +11944,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -11985,8 +11985,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -12024,8 +12024,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -12064,8 +12064,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -12103,8 +12103,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -12150,8 +12150,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 7
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 7
; GFX9-NEXT: v_readlane_b32 s30, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 7
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
; GFX9-NEXT: v_readlane_b32 s8, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
@@ -12195,8 +12195,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-NEXT: v_writelane_b32 v40, s31, 7
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-NEXT: v_readlane_b32 s30, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
; GFX10-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
@@ -12241,8 +12241,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 7
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s30, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 7
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
@@ -12286,8 +12286,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
@@ -12325,8 +12325,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -12355,8 +12355,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -12386,8 +12386,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -12416,8 +12416,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -12452,8 +12452,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -12484,8 +12484,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -12517,8 +12517,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -12549,8 +12549,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -12586,8 +12586,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -12618,8 +12618,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -12651,8 +12651,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -12683,8 +12683,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -12721,8 +12721,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -12754,8 +12754,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -12788,8 +12788,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -12821,8 +12821,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -12858,8 +12858,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -12891,8 +12891,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -12925,8 +12925,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -12958,8 +12958,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -12994,8 +12994,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13026,8 +13026,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13059,8 +13059,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13091,8 +13091,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13129,8 +13129,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13162,8 +13162,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13196,8 +13196,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13229,8 +13229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13264,8 +13264,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -13294,8 +13294,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -13325,8 +13325,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -13355,8 +13355,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
@@ -13391,8 +13391,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13423,8 +13423,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13456,8 +13456,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13488,8 +13488,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13526,8 +13526,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -13559,8 +13559,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -13593,8 +13593,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -13626,8 +13626,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
@@ -13665,8 +13665,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 4
; GFX9-NEXT: v_readlane_b32 s30, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 4
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
@@ -13701,8 +13701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-NEXT: v_writelane_b32 v40, s31, 4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-NEXT: v_readlane_b32 s30, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
@@ -13738,8 +13738,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 4
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s30, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 4
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
@@ -13774,8 +13774,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
@@ -13816,8 +13816,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -13855,8 +13855,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -13895,8 +13895,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -13934,8 +13934,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -13974,8 +13974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -14010,8 +14010,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -14047,8 +14047,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -14083,8 +14083,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -14127,8 +14127,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
@@ -14166,8 +14166,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
@@ -14206,8 +14206,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 5
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s30, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
@@ -14245,8 +14245,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
@@ -14290,8 +14290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 6
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 6
; GFX9-NEXT: v_readlane_b32 s30, v40, 5
+; GFX9-NEXT: v_readlane_b32 s31, v40, 6
; GFX9-NEXT: v_readlane_b32 s8, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
@@ -14332,8 +14332,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-NEXT: v_readlane_b32 s30, v40, 5
+; GFX10-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
@@ -14375,8 +14375,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 6
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s30, v40, 5
+; GFX11-NEXT: v_readlane_b32 s31, v40, 6
; GFX11-NEXT: v_readlane_b32 s8, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
@@ -14417,8 +14417,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
@@ -14464,8 +14464,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
+; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s11, v40, 7
; GFX9-NEXT: v_readlane_b32 s10, v40, 6
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
@@ -14510,8 +14510,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
@@ -14557,8 +14557,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
; GFX11-NEXT: v_readlane_b32 s10, v40, 6
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
@@ -14603,8 +14603,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
@@ -14660,8 +14660,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 9
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s30, v40, 8
+; GFX9-NEXT: v_readlane_b32 s31, v40, 9
; GFX9-NEXT: v_readlane_b32 s11, v40, 7
; GFX9-NEXT: v_readlane_b32 s10, v40, 6
; GFX9-NEXT: v_readlane_b32 s9, v40, 5
@@ -14711,8 +14711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-NEXT: v_readlane_b32 s9, v40, 5
@@ -14763,8 +14763,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 9
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s30, v40, 8
+; GFX11-NEXT: v_readlane_b32 s31, v40, 9
; GFX11-NEXT: v_readlane_b32 s11, v40, 7
; GFX11-NEXT: v_readlane_b32 s10, v40, 6
; GFX11-NEXT: v_readlane_b32 s9, v40, 5
@@ -14814,8 +14814,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
@@ -14872,8 +14872,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 17
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 17
; GFX9-NEXT: v_readlane_b32 s30, v40, 16
+; GFX9-NEXT: v_readlane_b32 s31, v40, 17
; GFX9-NEXT: v_readlane_b32 s19, v40, 15
; GFX9-NEXT: v_readlane_b32 s18, v40, 14
; GFX9-NEXT: v_readlane_b32 s17, v40, 13
@@ -14934,8 +14934,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 16
; GFX10-NEXT: v_writelane_b32 v40, s31, 17
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 17
; GFX10-NEXT: v_readlane_b32 s30, v40, 16
+; GFX10-NEXT: v_readlane_b32 s31, v40, 17
; GFX10-NEXT: v_readlane_b32 s19, v40, 15
; GFX10-NEXT: v_readlane_b32 s18, v40, 14
; GFX10-NEXT: v_readlane_b32 s17, v40, 13
@@ -14997,8 +14997,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 17
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 17
; GFX11-NEXT: v_readlane_b32 s30, v40, 16
+; GFX11-NEXT: v_readlane_b32 s31, v40, 17
; GFX11-NEXT: v_readlane_b32 s19, v40, 15
; GFX11-NEXT: v_readlane_b32 s18, v40, 14
; GFX11-NEXT: v_readlane_b32 s17, v40, 13
@@ -15059,8 +15059,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 16
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17
; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15
; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14
; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13
@@ -15161,8 +15161,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s31, 27
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s30, v40, 26
+; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s29, v40, 25
; GFX9-NEXT: v_readlane_b32 s28, v40, 24
; GFX9-NEXT: v_readlane_b32 s27, v40, 23
@@ -15268,8 +15268,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s30, v40, 26
+; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s29, v40, 25
; GFX10-NEXT: v_readlane_b32 s28, v40, 24
; GFX10-NEXT: v_readlane_b32 s27, v40, 23
@@ -15371,8 +15371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 27
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s30, v40, 26
+; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s29, v40, 25
; GFX11-NEXT: v_readlane_b32 s28, v40, 24
; GFX11-NEXT: v_readlane_b32 s27, v40, 23
@@ -15475,8 +15475,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25
; GFX10-SCRATCH-NEXT: v_readlane_b32 s28, v40, 24
; GFX10-SCRATCH-NEXT: v_readlane_b32 s27, v40, 23
@@ -15592,8 +15592,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s31, 27
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s30, v40, 26
+; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s29, v40, 25
; GFX9-NEXT: v_readlane_b32 s28, v40, 24
; GFX9-NEXT: v_readlane_b32 s27, v40, 23
@@ -15704,8 +15704,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s30, v40, 26
+; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s29, v40, 25
; GFX10-NEXT: v_readlane_b32 s28, v40, 24
; GFX10-NEXT: v_readlane_b32 s27, v40, 23
@@ -15811,8 +15811,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 27
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s30, v40, 26
+; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s29, v40, 25
; GFX11-NEXT: v_readlane_b32 s28, v40, 24
; GFX11-NEXT: v_readlane_b32 s27, v40, 23
@@ -15921,8 +15921,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25
; GFX10-SCRATCH-NEXT: v_readlane_b32 s28, v40, 24
; GFX10-SCRATCH-NEXT: v_readlane_b32 s27, v40, 23
@@ -15987,8 +15987,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -16021,8 +16021,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -16051,8 +16051,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -16081,8 +16081,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -16153,8 +16153,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v31, 11
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -16220,8 +16220,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_12xv3i32 at abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -16267,8 +16267,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -16331,8 +16331,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_12xv3i32 at abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -16423,8 +16423,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v31, 7
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -16498,8 +16498,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_8xv5i32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -16549,8 +16549,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -16619,8 +16619,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_8xv5i32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -16707,8 +16707,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -16782,8 +16782,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-NEXT: s_mov_b32 s34, external_void_func_8xv5f32@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -16838,8 +16838,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -16908,8 +16908,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_8xv5f32@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -16948,8 +16948,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -16975,8 +16975,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17003,8 +17003,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17030,8 +17030,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17062,8 +17062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17089,8 +17089,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17117,8 +17117,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17144,8 +17144,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17176,8 +17176,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17203,8 +17203,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17231,8 +17231,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17258,8 +17258,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17290,8 +17290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17317,8 +17317,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17345,8 +17345,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17372,8 +17372,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17404,8 +17404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17431,8 +17431,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17459,8 +17459,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17486,8 +17486,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17518,8 +17518,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17545,8 +17545,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17573,8 +17573,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17600,8 +17600,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17632,8 +17632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17659,8 +17659,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17687,8 +17687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17714,8 +17714,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17746,8 +17746,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17773,8 +17773,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17801,8 +17801,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17828,8 +17828,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17860,8 +17860,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -17887,8 +17887,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -17915,8 +17915,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -17942,8 +17942,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -17974,8 +17974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18001,8 +18001,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18029,8 +18029,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18056,8 +18056,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18088,8 +18088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18115,8 +18115,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18143,8 +18143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18170,8 +18170,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18202,8 +18202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18229,8 +18229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18257,8 +18257,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18284,8 +18284,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18316,8 +18316,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18343,8 +18343,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18371,8 +18371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18398,8 +18398,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
@@ -18430,8 +18430,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -18457,8 +18457,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -18485,8 +18485,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -18512,8 +18512,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 0005e8a2619b2..260398a519660 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -26,8 +26,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
@@ -60,8 +60,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
@@ -95,8 +95,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s30, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
@@ -130,8 +130,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; clobber
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: v_readlane_b32 s31, v0, 3
; GFX9-NEXT: v_readlane_b32 s30, v0, 2
+; GFX9-NEXT: v_readlane_b32 s31, v0, 3
; GFX9-NEXT: v_readlane_b32 s29, v0, 1
; GFX9-NEXT: v_readlane_b32 s28, v0, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1
@@ -157,8 +157,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; clobber
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_readlane_b32 s31, v0, 3
; GFX10-NEXT: v_readlane_b32 s30, v0, 2
+; GFX10-NEXT: v_readlane_b32 s31, v0, 3
; GFX10-NEXT: v_readlane_b32 s29, v0, 1
; GFX10-NEXT: v_readlane_b32 s28, v0, 0
; GFX10-NEXT: s_xor_saveexec_b32 s34, -1
@@ -185,8 +185,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
; GFX11-NEXT: ; clobber
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v0, 3
; GFX11-NEXT: v_readlane_b32 s30, v0, 2
+; GFX11-NEXT: v_readlane_b32 s31, v0, 3
; GFX11-NEXT: v_readlane_b32 s29, v0, 1
; GFX11-NEXT: v_readlane_b32 s28, v0, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
@@ -224,8 +224,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s31
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -261,8 +261,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s31
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -298,8 +298,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s31
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -341,8 +341,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX9-NEXT: ; use v31
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: v_readlane_b32 s30, v41, 0
+; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v41, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -378,8 +378,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX10-NEXT: ; use v31
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: v_readlane_b32 s30, v41, 0
+; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v41, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -416,8 +416,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
; GFX11-NEXT: ; use v31
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: v_readlane_b32 s30, v41, 0
+; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v41, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -455,11 +455,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s33
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -492,11 +492,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_mov_b32 s33, s4
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s33
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -529,12 +529,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_mov_b32 s33, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s33
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -572,11 +572,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: s_mov_b32 s34, s4
+; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s34
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -609,11 +609,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_mov_b32 s34, s4
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s34
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -645,13 +645,13 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: s_mov_b32 s34, s4
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s34
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -691,8 +691,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX9-NEXT: ; use v40
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: v_readlane_b32 s30, v41, 0
+; GFX9-NEXT: v_readlane_b32 s31, v41, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v41, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -726,8 +726,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX10-NEXT: ; use v40
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: v_readlane_b32 s30, v41, 0
+; GFX10-NEXT: v_readlane_b32 s31, v41, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v41, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -761,8 +761,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
; GFX11-NEXT: ; use v40
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: v_readlane_b32 s30, v41, 0
+; GFX11-NEXT: v_readlane_b32 s31, v41, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v41, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -849,8 +849,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -876,8 +876,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -934,8 +934,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -961,8 +961,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
@@ -989,8 +989,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -1024,11 +1024,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_mov_b32 s4, s40
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s4
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
@@ -1060,11 +1060,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
@@ -1096,12 +1096,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s4
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
@@ -1150,8 +1150,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX9-NEXT: ; use v40
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v41, 2
; GFX9-NEXT: v_readlane_b32 s30, v41, 1
+; GFX9-NEXT: v_readlane_b32 s31, v41, 2
; GFX9-NEXT: v_readlane_b32 s4, v41, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s34, v41, 3
@@ -1195,8 +1195,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX10-NEXT: ; use v40
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: v_readlane_b32 s31, v41, 2
; GFX10-NEXT: v_readlane_b32 s30, v41, 1
+; GFX10-NEXT: v_readlane_b32 s31, v41, 2
; GFX10-NEXT: v_readlane_b32 s4, v41, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s34, v41, 3
@@ -1240,8 +1240,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
; GFX11-NEXT: ; use v40
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: v_readlane_b32 s31, v41, 2
; GFX11-NEXT: v_readlane_b32 s30, v41, 1
+; GFX11-NEXT: v_readlane_b32 s31, v41, 2
; GFX11-NEXT: v_readlane_b32 s4, v41, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v41, 3
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index c06011c259f9b..0b54bbd7e2105 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -34,8 +34,8 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v1, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
+; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -59,8 +59,8 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
+; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -85,8 +85,8 @@ define amdgpu_gfx void @call_i1() #0 {
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
+; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -136,8 +136,8 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v1, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
+; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -161,8 +161,8 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
+; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -187,8 +187,8 @@ define amdgpu_gfx void @call_i16() #0 {
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
+; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -232,8 +232,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v1, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: v_readlane_b32 s30, v1, 0
+; GFX9-NEXT: v_readlane_b32 s31, v1, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -257,8 +257,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v1, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: v_readlane_b32 s30, v1, 0
+; GFX10-NEXT: v_readlane_b32 s31, v1, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -283,8 +283,8 @@ define amdgpu_gfx void @call_2xi16() #0 {
; GFX11-NEXT: v_writelane_b32 v1, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: v_readlane_b32 s30, v1, 0
+; GFX11-NEXT: v_readlane_b32 s31, v1, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload
@@ -336,8 +336,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -361,8 +361,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
+; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -387,8 +387,8 @@ define amdgpu_gfx void @call_3xi16() #0 {
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
@@ -750,8 +750,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v100, 1
; GFX9-NEXT: v_readlane_b32 s30, v100, 0
+; GFX9-NEXT: v_readlane_b32 s31, v100, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -840,8 +840,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:116
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:120
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:124
-; GFX10-NEXT: v_readlane_b32 s31, v100, 1
; GFX10-NEXT: v_readlane_b32 s30, v100, 0
+; GFX10-NEXT: v_readlane_b32 s31, v100, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -931,8 +931,8 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:116
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:120
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:124
-; GFX11-NEXT: v_readlane_b32 s31, v100, 1
; GFX11-NEXT: v_readlane_b32 s30, v100, 0
+; GFX11-NEXT: v_readlane_b32 s31, v100, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload
@@ -2151,8 +2151,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX9-NEXT: s_add_i32 s32, s32, 0x60000
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
-; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s38
; GFX9-NEXT: s_xor_saveexec_b64 s[36:37], -1
@@ -2181,8 +2181,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX10-NEXT: s_add_i32 s32, s32, 0x30000
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
-; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
+; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: s_mov_b32 s32, s34
; GFX10-NEXT: s_mov_b32 s34, s38
; GFX10-NEXT: s_xor_saveexec_b32 s36, -1
@@ -2213,8 +2213,8 @@ define amdgpu_gfx void @call_512xi32() #0 {
; GFX11-NEXT: v_writelane_b32 v5, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v5, 1
; GFX11-NEXT: v_readlane_b32 s30, v5, 0
+; GFX11-NEXT: v_readlane_b32 s31, v5, 1
; GFX11-NEXT: s_mov_b32 s32, s34
; GFX11-NEXT: s_mov_b32 s34, s36
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
@@ -2889,8 +2889,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
+; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s38
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
@@ -3167,8 +3167,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56
-; GFX10-NEXT: v_readlane_b32 s31, v63, 1
; GFX10-NEXT: v_readlane_b32 s30, v63, 0
+; GFX10-NEXT: v_readlane_b32 s31, v63, 1
; GFX10-NEXT: s_mov_b32 s32, s34
; GFX10-NEXT: s_mov_b32 s34, s38
; GFX10-NEXT: s_or_saveexec_b32 s36, -1
@@ -3347,8 +3347,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:44
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:48
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:52
-; GFX11-NEXT: v_readlane_b32 s31, v62, 1
; GFX11-NEXT: v_readlane_b32 s30, v62, 0
+; GFX11-NEXT: v_readlane_b32 s31, v62, 1
; GFX11-NEXT: s_mov_b32 s32, s34
; GFX11-NEXT: s_mov_b32 s34, s39
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll
index d8df20eb69452..4c7bef4aec091 100644
--- a/llvm/test/CodeGen/AMDGPU/global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll
@@ -35,8 +35,8 @@ define void @bar() {
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 76f204dd0c16a..e1f6906a89c29 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -9,28 +9,30 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-NEXT: v_writelane_b32 v6, s30, 0
-; CHECK-NEXT: v_writelane_b32 v6, s31, 1
-; CHECK-NEXT: v_writelane_b32 v6, s36, 2
-; CHECK-NEXT: v_writelane_b32 v6, s37, 3
-; CHECK-NEXT: v_writelane_b32 v6, s38, 4
-; CHECK-NEXT: v_writelane_b32 v6, s39, 5
-; CHECK-NEXT: v_writelane_b32 v6, s48, 6
-; CHECK-NEXT: v_writelane_b32 v6, s49, 7
-; CHECK-NEXT: v_writelane_b32 v6, s50, 8
-; CHECK-NEXT: v_writelane_b32 v6, s51, 9
-; CHECK-NEXT: v_writelane_b32 v6, s52, 10
-; CHECK-NEXT: v_writelane_b32 v6, s53, 11
-; CHECK-NEXT: v_writelane_b32 v6, s54, 12
-; CHECK-NEXT: v_writelane_b32 v6, s55, 13
-; CHECK-NEXT: v_writelane_b32 v6, s64, 14
-; CHECK-NEXT: v_writelane_b32 v6, s65, 15
-; CHECK-NEXT: v_writelane_b32 v6, s66, 16
-; CHECK-NEXT: v_writelane_b32 v6, s67, 17
-; CHECK-NEXT: v_writelane_b32 v6, s68, 18
+; CHECK-NEXT: v_writelane_b32 v6, s36, 0
+; CHECK-NEXT: v_writelane_b32 v6, s37, 1
+; CHECK-NEXT: v_writelane_b32 v6, s38, 2
+; CHECK-NEXT: v_writelane_b32 v6, s39, 3
+; CHECK-NEXT: v_writelane_b32 v6, s48, 4
+; CHECK-NEXT: v_writelane_b32 v6, s49, 5
+; CHECK-NEXT: v_writelane_b32 v6, s50, 6
+; CHECK-NEXT: v_writelane_b32 v6, s51, 7
+; CHECK-NEXT: v_writelane_b32 v6, s52, 8
+; CHECK-NEXT: v_writelane_b32 v6, s53, 9
+; CHECK-NEXT: v_writelane_b32 v6, s54, 10
+; CHECK-NEXT: v_writelane_b32 v6, s55, 11
+; CHECK-NEXT: v_writelane_b32 v6, s64, 12
+; CHECK-NEXT: v_writelane_b32 v6, s65, 13
+; CHECK-NEXT: v_writelane_b32 v6, s66, 14
+; CHECK-NEXT: v_writelane_b32 v6, s67, 15
+; CHECK-NEXT: v_writelane_b32 v6, s68, 16
+; CHECK-NEXT: v_writelane_b32 v6, s69, 17
+; CHECK-NEXT: v_writelane_b32 v6, s70, 18
+; CHECK-NEXT: v_writelane_b32 v6, s71, 19
+; CHECK-NEXT: v_writelane_b32 v6, s30, 20
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: v_writelane_b32 v6, s69, 19
+; CHECK-NEXT: v_writelane_b32 v6, s31, 21
; CHECK-NEXT: s_mov_b32 s68, 0
; CHECK-NEXT: s_mov_b32 s69, s4
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
@@ -40,11 +42,11 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130
; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
-; CHECK-NEXT: v_writelane_b32 v6, s70, 20
-; CHECK-NEXT: v_writelane_b32 v6, s71, 21
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0
+; CHECK-NEXT: s_mov_b32 s70, s68
; CHECK-NEXT: v_writelane_b32 v7, s8, 0
; CHECK-NEXT: v_writelane_b32 v7, s9, 1
; CHECK-NEXT: v_writelane_b32 v7, s10, 2
@@ -77,9 +79,7 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v7, s65, 29
; CHECK-NEXT: v_writelane_b32 v7, s66, 30
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0
-; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0
; CHECK-NEXT: s_mov_b32 s69, s68
-; CHECK-NEXT: s_mov_b32 s70, s68
; CHECK-NEXT: s_mov_b32 s71, s68
; CHECK-NEXT: v_writelane_b32 v7, s67, 31
; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
@@ -225,29 +225,29 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock
; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT: v_readlane_b32 s71, v6, 21
-; CHECK-NEXT: v_readlane_b32 s70, v6, 20
-; CHECK-NEXT: v_readlane_b32 s69, v6, 19
-; CHECK-NEXT: v_readlane_b32 s68, v6, 18
-; CHECK-NEXT: v_readlane_b32 s67, v6, 17
-; CHECK-NEXT: v_readlane_b32 s66, v6, 16
-; CHECK-NEXT: v_readlane_b32 s65, v6, 15
-; CHECK-NEXT: v_readlane_b32 s64, v6, 14
-; CHECK-NEXT: v_readlane_b32 s55, v6, 13
-; CHECK-NEXT: v_readlane_b32 s54, v6, 12
-; CHECK-NEXT: v_readlane_b32 s53, v6, 11
-; CHECK-NEXT: v_readlane_b32 s52, v6, 10
+; CHECK-NEXT: v_readlane_b32 s30, v6, 20
+; CHECK-NEXT: v_readlane_b32 s31, v6, 21
+; CHECK-NEXT: v_readlane_b32 s71, v6, 19
+; CHECK-NEXT: v_readlane_b32 s70, v6, 18
+; CHECK-NEXT: v_readlane_b32 s69, v6, 17
+; CHECK-NEXT: v_readlane_b32 s68, v6, 16
+; CHECK-NEXT: v_readlane_b32 s67, v6, 15
+; CHECK-NEXT: v_readlane_b32 s66, v6, 14
+; CHECK-NEXT: v_readlane_b32 s65, v6, 13
+; CHECK-NEXT: v_readlane_b32 s64, v6, 12
+; CHECK-NEXT: v_readlane_b32 s55, v6, 11
+; CHECK-NEXT: v_readlane_b32 s54, v6, 10
+; CHECK-NEXT: v_readlane_b32 s53, v6, 9
+; CHECK-NEXT: v_readlane_b32 s52, v6, 8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_readlane_b32 s51, v6, 9
-; CHECK-NEXT: v_readlane_b32 s50, v6, 8
-; CHECK-NEXT: v_readlane_b32 s49, v6, 7
-; CHECK-NEXT: v_readlane_b32 s48, v6, 6
-; CHECK-NEXT: v_readlane_b32 s39, v6, 5
-; CHECK-NEXT: v_readlane_b32 s38, v6, 4
-; CHECK-NEXT: v_readlane_b32 s37, v6, 3
-; CHECK-NEXT: v_readlane_b32 s36, v6, 2
-; CHECK-NEXT: v_readlane_b32 s31, v6, 1
-; CHECK-NEXT: v_readlane_b32 s30, v6, 0
+; CHECK-NEXT: v_readlane_b32 s51, v6, 7
+; CHECK-NEXT: v_readlane_b32 s50, v6, 6
+; CHECK-NEXT: v_readlane_b32 s49, v6, 5
+; CHECK-NEXT: v_readlane_b32 s48, v6, 4
+; CHECK-NEXT: v_readlane_b32 s39, v6, 3
+; CHECK-NEXT: v_readlane_b32 s38, v6, 2
+; CHECK-NEXT: v_readlane_b32 s37, v6, 1
+; CHECK-NEXT: v_readlane_b32 s36, v6, 0
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -336,7 +336,7 @@ declare <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 immarg, float,
declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
-attributes #0 = { "amdgpu-waves-per-eu"="10,10" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(read) }
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index a208cfdb197af..2aaaff1ecc407 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -128,24 +128,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
-; GCN-NEXT: v_writelane_b32 v40, s64, 16
-; GCN-NEXT: v_writelane_b32 v40, s65, 17
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s64, 14
+; GCN-NEXT: v_writelane_b32 v40, s65, 15
+; GCN-NEXT: v_writelane_b32 v40, s30, 16
+; GCN-NEXT: v_writelane_b32 v40, s31, 17
; GCN-NEXT: s_mov_b32 s50, s15
; GCN-NEXT: s_mov_b32 s51, s14
; GCN-NEXT: s_mov_b32 s52, s13
@@ -175,24 +175,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: s_cbranch_execnz .LBB2_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[54:55]
-; GCN-NEXT: v_readlane_b32 s65, v40, 17
-; GCN-NEXT: v_readlane_b32 s64, v40, 16
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 16
+; GCN-NEXT: v_readlane_b32 s31, v40, 17
+; GCN-NEXT: v_readlane_b32 s65, v40, 15
+; GCN-NEXT: v_readlane_b32 s64, v40, 14
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -212,24 +212,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
-; GISEL-NEXT: v_writelane_b32 v40, s64, 16
-; GISEL-NEXT: v_writelane_b32 v40, s65, 17
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s64, 14
+; GISEL-NEXT: v_writelane_b32 v40, s65, 15
+; GISEL-NEXT: v_writelane_b32 v40, s30, 16
+; GISEL-NEXT: v_writelane_b32 v40, s31, 17
; GISEL-NEXT: s_mov_b32 s50, s15
; GISEL-NEXT: s_mov_b32 s51, s14
; GISEL-NEXT: s_mov_b32 s52, s13
@@ -259,24 +259,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: s_cbranch_execnz .LBB2_1
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[54:55]
-; GISEL-NEXT: v_readlane_b32 s65, v40, 17
-; GISEL-NEXT: v_readlane_b32 s64, v40, 16
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 16
+; GISEL-NEXT: v_readlane_b32 s31, v40, 17
+; GISEL-NEXT: v_readlane_b32 s65, v40, 15
+; GISEL-NEXT: v_readlane_b32 s64, v40, 14
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -300,24 +300,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
-; GCN-NEXT: v_writelane_b32 v40, s64, 16
-; GCN-NEXT: v_writelane_b32 v40, s65, 17
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s64, 14
+; GCN-NEXT: v_writelane_b32 v40, s65, 15
+; GCN-NEXT: v_writelane_b32 v40, s30, 16
+; GCN-NEXT: v_writelane_b32 v40, s31, 17
; GCN-NEXT: s_mov_b32 s50, s15
; GCN-NEXT: s_mov_b32 s51, s14
; GCN-NEXT: s_mov_b32 s52, s13
@@ -350,24 +350,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GCN-NEXT: s_cbranch_execnz .LBB3_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[54:55]
-; GCN-NEXT: v_readlane_b32 s65, v40, 17
-; GCN-NEXT: v_readlane_b32 s64, v40, 16
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 16
+; GCN-NEXT: v_readlane_b32 s31, v40, 17
+; GCN-NEXT: v_readlane_b32 s65, v40, 15
+; GCN-NEXT: v_readlane_b32 s64, v40, 14
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -387,24 +387,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
-; GISEL-NEXT: v_writelane_b32 v40, s64, 16
-; GISEL-NEXT: v_writelane_b32 v40, s65, 17
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s64, 14
+; GISEL-NEXT: v_writelane_b32 v40, s65, 15
+; GISEL-NEXT: v_writelane_b32 v40, s30, 16
+; GISEL-NEXT: v_writelane_b32 v40, s31, 17
; GISEL-NEXT: s_mov_b32 s50, s15
; GISEL-NEXT: s_mov_b32 s51, s14
; GISEL-NEXT: s_mov_b32 s52, s13
@@ -435,24 +435,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GISEL-NEXT: s_cbranch_execnz .LBB3_1
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[54:55]
-; GISEL-NEXT: v_readlane_b32 s65, v40, 17
-; GISEL-NEXT: v_readlane_b32 s64, v40, 16
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 16
+; GISEL-NEXT: v_readlane_b32 s31, v40, 17
+; GISEL-NEXT: v_readlane_b32 s65, v40, 15
+; GISEL-NEXT: v_readlane_b32 s64, v40, 14
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -476,24 +476,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
-; GCN-NEXT: v_writelane_b32 v40, s64, 16
-; GCN-NEXT: v_writelane_b32 v40, s65, 17
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s64, 14
+; GCN-NEXT: v_writelane_b32 v40, s65, 15
+; GCN-NEXT: v_writelane_b32 v40, s30, 16
+; GCN-NEXT: v_writelane_b32 v40, s31, 17
; GCN-NEXT: s_mov_b32 s50, s15
; GCN-NEXT: s_mov_b32 s51, s14
; GCN-NEXT: s_mov_b32 s52, s13
@@ -525,24 +525,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[54:55]
; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2
-; GCN-NEXT: v_readlane_b32 s65, v40, 17
-; GCN-NEXT: v_readlane_b32 s64, v40, 16
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 16
+; GCN-NEXT: v_readlane_b32 s31, v40, 17
+; GCN-NEXT: v_readlane_b32 s65, v40, 15
+; GCN-NEXT: v_readlane_b32 s64, v40, 14
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -562,24 +562,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
-; GISEL-NEXT: v_writelane_b32 v40, s64, 16
-; GISEL-NEXT: v_writelane_b32 v40, s65, 17
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s64, 14
+; GISEL-NEXT: v_writelane_b32 v40, s65, 15
+; GISEL-NEXT: v_writelane_b32 v40, s30, 16
+; GISEL-NEXT: v_writelane_b32 v40, s31, 17
; GISEL-NEXT: s_mov_b32 s50, s15
; GISEL-NEXT: s_mov_b32 s51, s14
; GISEL-NEXT: s_mov_b32 s52, s13
@@ -611,24 +611,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[54:55]
; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v1
-; GISEL-NEXT: v_readlane_b32 s65, v40, 17
-; GISEL-NEXT: v_readlane_b32 s64, v40, 16
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 16
+; GISEL-NEXT: v_readlane_b32 s31, v40, 17
+; GISEL-NEXT: v_readlane_b32 s65, v40, 15
+; GISEL-NEXT: v_readlane_b32 s64, v40, 14
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -653,26 +653,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 20
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
-; GCN-NEXT: v_writelane_b32 v40, s64, 16
-; GCN-NEXT: v_writelane_b32 v40, s65, 17
-; GCN-NEXT: v_writelane_b32 v40, s66, 18
-; GCN-NEXT: v_writelane_b32 v40, s67, 19
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s64, 14
+; GCN-NEXT: v_writelane_b32 v40, s65, 15
+; GCN-NEXT: v_writelane_b32 v40, s66, 16
+; GCN-NEXT: v_writelane_b32 v40, s67, 17
+; GCN-NEXT: v_writelane_b32 v40, s30, 18
+; GCN-NEXT: v_writelane_b32 v40, s31, 19
; GCN-NEXT: s_mov_b32 s50, s15
; GCN-NEXT: s_mov_b32 s51, s14
; GCN-NEXT: s_mov_b32 s52, s13
@@ -709,26 +709,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: s_mov_b64 exec, s[64:65]
; GCN-NEXT: .LBB5_4: ; %bb2
; GCN-NEXT: s_or_b64 exec, exec, s[54:55]
-; GCN-NEXT: v_readlane_b32 s67, v40, 19
-; GCN-NEXT: v_readlane_b32 s66, v40, 18
-; GCN-NEXT: v_readlane_b32 s65, v40, 17
-; GCN-NEXT: v_readlane_b32 s64, v40, 16
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 18
+; GCN-NEXT: v_readlane_b32 s31, v40, 19
+; GCN-NEXT: v_readlane_b32 s67, v40, 17
+; GCN-NEXT: v_readlane_b32 s66, v40, 16
+; GCN-NEXT: v_readlane_b32 s65, v40, 15
+; GCN-NEXT: v_readlane_b32 s64, v40, 14
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 20
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -748,26 +748,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 20
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
-; GISEL-NEXT: v_writelane_b32 v40, s64, 16
-; GISEL-NEXT: v_writelane_b32 v40, s65, 17
-; GISEL-NEXT: v_writelane_b32 v40, s66, 18
-; GISEL-NEXT: v_writelane_b32 v40, s67, 19
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s64, 14
+; GISEL-NEXT: v_writelane_b32 v40, s65, 15
+; GISEL-NEXT: v_writelane_b32 v40, s66, 16
+; GISEL-NEXT: v_writelane_b32 v40, s67, 17
+; GISEL-NEXT: v_writelane_b32 v40, s30, 18
+; GISEL-NEXT: v_writelane_b32 v40, s31, 19
; GISEL-NEXT: s_mov_b32 s50, s15
; GISEL-NEXT: s_mov_b32 s51, s14
; GISEL-NEXT: s_mov_b32 s52, s13
@@ -804,26 +804,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: s_mov_b64 exec, s[64:65]
; GISEL-NEXT: .LBB5_4: ; %bb2
; GISEL-NEXT: s_or_b64 exec, exec, s[54:55]
-; GISEL-NEXT: v_readlane_b32 s67, v40, 19
-; GISEL-NEXT: v_readlane_b32 s66, v40, 18
-; GISEL-NEXT: v_readlane_b32 s65, v40, 17
-; GISEL-NEXT: v_readlane_b32 s64, v40, 16
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 18
+; GISEL-NEXT: v_readlane_b32 s31, v40, 19
+; GISEL-NEXT: v_readlane_b32 s67, v40, 17
+; GISEL-NEXT: v_readlane_b32 s66, v40, 16
+; GISEL-NEXT: v_readlane_b32 s65, v40, 15
+; GISEL-NEXT: v_readlane_b32 s64, v40, 14
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s4, v40, 20
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -853,22 +853,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s30, 14
+; GCN-NEXT: v_writelane_b32 v40, s31, 15
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
@@ -882,22 +882,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: s_cbranch_execnz .LBB6_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 14
+; GCN-NEXT: v_readlane_b32 s31, v40, 15
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -915,22 +915,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s30, 14
+; GISEL-NEXT: v_writelane_b32 v40, s31, 15
; GISEL-NEXT: s_mov_b64 s[6:7], exec
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
@@ -944,22 +944,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: s_cbranch_execnz .LBB6_1
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 14
+; GISEL-NEXT: v_readlane_b32 s31, v40, 15
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -982,22 +982,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v41, s30, 0
-; GCN-NEXT: v_writelane_b32 v41, s31, 1
-; GCN-NEXT: v_writelane_b32 v41, s34, 2
-; GCN-NEXT: v_writelane_b32 v41, s35, 3
-; GCN-NEXT: v_writelane_b32 v41, s36, 4
-; GCN-NEXT: v_writelane_b32 v41, s37, 5
-; GCN-NEXT: v_writelane_b32 v41, s38, 6
-; GCN-NEXT: v_writelane_b32 v41, s39, 7
-; GCN-NEXT: v_writelane_b32 v41, s48, 8
-; GCN-NEXT: v_writelane_b32 v41, s49, 9
-; GCN-NEXT: v_writelane_b32 v41, s50, 10
-; GCN-NEXT: v_writelane_b32 v41, s51, 11
-; GCN-NEXT: v_writelane_b32 v41, s52, 12
-; GCN-NEXT: v_writelane_b32 v41, s53, 13
-; GCN-NEXT: v_writelane_b32 v41, s54, 14
-; GCN-NEXT: v_writelane_b32 v41, s55, 15
+; GCN-NEXT: v_writelane_b32 v41, s34, 0
+; GCN-NEXT: v_writelane_b32 v41, s35, 1
+; GCN-NEXT: v_writelane_b32 v41, s36, 2
+; GCN-NEXT: v_writelane_b32 v41, s37, 3
+; GCN-NEXT: v_writelane_b32 v41, s38, 4
+; GCN-NEXT: v_writelane_b32 v41, s39, 5
+; GCN-NEXT: v_writelane_b32 v41, s48, 6
+; GCN-NEXT: v_writelane_b32 v41, s49, 7
+; GCN-NEXT: v_writelane_b32 v41, s50, 8
+; GCN-NEXT: v_writelane_b32 v41, s51, 9
+; GCN-NEXT: v_writelane_b32 v41, s52, 10
+; GCN-NEXT: v_writelane_b32 v41, s53, 11
+; GCN-NEXT: v_writelane_b32 v41, s54, 12
+; GCN-NEXT: v_writelane_b32 v41, s55, 13
+; GCN-NEXT: v_writelane_b32 v41, s30, 14
+; GCN-NEXT: v_writelane_b32 v41, s31, 15
; GCN-NEXT: v_mov_b32_e32 v40, v0
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1013,22 +1013,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v40
-; GCN-NEXT: v_readlane_b32 s55, v41, 15
-; GCN-NEXT: v_readlane_b32 s54, v41, 14
-; GCN-NEXT: v_readlane_b32 s53, v41, 13
-; GCN-NEXT: v_readlane_b32 s52, v41, 12
-; GCN-NEXT: v_readlane_b32 s51, v41, 11
-; GCN-NEXT: v_readlane_b32 s50, v41, 10
-; GCN-NEXT: v_readlane_b32 s49, v41, 9
-; GCN-NEXT: v_readlane_b32 s48, v41, 8
-; GCN-NEXT: v_readlane_b32 s39, v41, 7
-; GCN-NEXT: v_readlane_b32 s38, v41, 6
-; GCN-NEXT: v_readlane_b32 s37, v41, 5
-; GCN-NEXT: v_readlane_b32 s36, v41, 4
-; GCN-NEXT: v_readlane_b32 s35, v41, 3
-; GCN-NEXT: v_readlane_b32 s34, v41, 2
-; GCN-NEXT: v_readlane_b32 s31, v41, 1
-; GCN-NEXT: v_readlane_b32 s30, v41, 0
+; GCN-NEXT: v_readlane_b32 s30, v41, 14
+; GCN-NEXT: v_readlane_b32 s31, v41, 15
+; GCN-NEXT: v_readlane_b32 s55, v41, 13
+; GCN-NEXT: v_readlane_b32 s54, v41, 12
+; GCN-NEXT: v_readlane_b32 s53, v41, 11
+; GCN-NEXT: v_readlane_b32 s52, v41, 10
+; GCN-NEXT: v_readlane_b32 s51, v41, 9
+; GCN-NEXT: v_readlane_b32 s50, v41, 8
+; GCN-NEXT: v_readlane_b32 s49, v41, 7
+; GCN-NEXT: v_readlane_b32 s48, v41, 6
+; GCN-NEXT: v_readlane_b32 s39, v41, 5
+; GCN-NEXT: v_readlane_b32 s38, v41, 4
+; GCN-NEXT: v_readlane_b32 s37, v41, 3
+; GCN-NEXT: v_readlane_b32 s36, v41, 2
+; GCN-NEXT: v_readlane_b32 s35, v41, 1
+; GCN-NEXT: v_readlane_b32 s34, v41, 0
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
@@ -1048,22 +1048,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT: v_writelane_b32 v41, s30, 0
-; GISEL-NEXT: v_writelane_b32 v41, s31, 1
-; GISEL-NEXT: v_writelane_b32 v41, s34, 2
-; GISEL-NEXT: v_writelane_b32 v41, s35, 3
-; GISEL-NEXT: v_writelane_b32 v41, s36, 4
-; GISEL-NEXT: v_writelane_b32 v41, s37, 5
-; GISEL-NEXT: v_writelane_b32 v41, s38, 6
-; GISEL-NEXT: v_writelane_b32 v41, s39, 7
-; GISEL-NEXT: v_writelane_b32 v41, s48, 8
-; GISEL-NEXT: v_writelane_b32 v41, s49, 9
-; GISEL-NEXT: v_writelane_b32 v41, s50, 10
-; GISEL-NEXT: v_writelane_b32 v41, s51, 11
-; GISEL-NEXT: v_writelane_b32 v41, s52, 12
-; GISEL-NEXT: v_writelane_b32 v41, s53, 13
-; GISEL-NEXT: v_writelane_b32 v41, s54, 14
-; GISEL-NEXT: v_writelane_b32 v41, s55, 15
+; GISEL-NEXT: v_writelane_b32 v41, s34, 0
+; GISEL-NEXT: v_writelane_b32 v41, s35, 1
+; GISEL-NEXT: v_writelane_b32 v41, s36, 2
+; GISEL-NEXT: v_writelane_b32 v41, s37, 3
+; GISEL-NEXT: v_writelane_b32 v41, s38, 4
+; GISEL-NEXT: v_writelane_b32 v41, s39, 5
+; GISEL-NEXT: v_writelane_b32 v41, s48, 6
+; GISEL-NEXT: v_writelane_b32 v41, s49, 7
+; GISEL-NEXT: v_writelane_b32 v41, s50, 8
+; GISEL-NEXT: v_writelane_b32 v41, s51, 9
+; GISEL-NEXT: v_writelane_b32 v41, s52, 10
+; GISEL-NEXT: v_writelane_b32 v41, s53, 11
+; GISEL-NEXT: v_writelane_b32 v41, s54, 12
+; GISEL-NEXT: v_writelane_b32 v41, s55, 13
+; GISEL-NEXT: v_writelane_b32 v41, s30, 14
+; GISEL-NEXT: v_writelane_b32 v41, s31, 15
; GISEL-NEXT: v_mov_b32_e32 v40, v0
; GISEL-NEXT: s_mov_b64 s[4:5], exec
; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1079,22 +1079,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: v_mov_b32_e32 v0, v40
-; GISEL-NEXT: v_readlane_b32 s55, v41, 15
-; GISEL-NEXT: v_readlane_b32 s54, v41, 14
-; GISEL-NEXT: v_readlane_b32 s53, v41, 13
-; GISEL-NEXT: v_readlane_b32 s52, v41, 12
-; GISEL-NEXT: v_readlane_b32 s51, v41, 11
-; GISEL-NEXT: v_readlane_b32 s50, v41, 10
-; GISEL-NEXT: v_readlane_b32 s49, v41, 9
-; GISEL-NEXT: v_readlane_b32 s48, v41, 8
-; GISEL-NEXT: v_readlane_b32 s39, v41, 7
-; GISEL-NEXT: v_readlane_b32 s38, v41, 6
-; GISEL-NEXT: v_readlane_b32 s37, v41, 5
-; GISEL-NEXT: v_readlane_b32 s36, v41, 4
-; GISEL-NEXT: v_readlane_b32 s35, v41, 3
-; GISEL-NEXT: v_readlane_b32 s34, v41, 2
-; GISEL-NEXT: v_readlane_b32 s31, v41, 1
-; GISEL-NEXT: v_readlane_b32 s30, v41, 0
+; GISEL-NEXT: v_readlane_b32 s30, v41, 14
+; GISEL-NEXT: v_readlane_b32 s31, v41, 15
+; GISEL-NEXT: v_readlane_b32 s55, v41, 13
+; GISEL-NEXT: v_readlane_b32 s54, v41, 12
+; GISEL-NEXT: v_readlane_b32 s53, v41, 11
+; GISEL-NEXT: v_readlane_b32 s52, v41, 10
+; GISEL-NEXT: v_readlane_b32 s51, v41, 9
+; GISEL-NEXT: v_readlane_b32 s50, v41, 8
+; GISEL-NEXT: v_readlane_b32 s49, v41, 7
+; GISEL-NEXT: v_readlane_b32 s48, v41, 6
+; GISEL-NEXT: v_readlane_b32 s39, v41, 5
+; GISEL-NEXT: v_readlane_b32 s38, v41, 4
+; GISEL-NEXT: v_readlane_b32 s37, v41, 3
+; GISEL-NEXT: v_readlane_b32 s36, v41, 2
+; GISEL-NEXT: v_readlane_b32 s35, v41, 1
+; GISEL-NEXT: v_readlane_b32 s34, v41, 0
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
@@ -1121,22 +1121,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s30, 14
+; GCN-NEXT: v_writelane_b32 v40, s31, 15
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v1
@@ -1152,22 +1152,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v3
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 14
+; GCN-NEXT: v_readlane_b32 s31, v40, 15
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1185,22 +1185,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s30, 14
+; GISEL-NEXT: v_writelane_b32 v40, s31, 15
; GISEL-NEXT: s_mov_b64 s[4:5], exec
; GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s8, v1
@@ -1216,22 +1216,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: v_mov_b32_e32 v0, v2
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 14
+; GISEL-NEXT: v_readlane_b32 s31, v40, 15
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1254,22 +1254,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
-; GCN-NEXT: v_writelane_b32 v40, s36, 4
-; GCN-NEXT: v_writelane_b32 v40, s37, 5
-; GCN-NEXT: v_writelane_b32 v40, s38, 6
-; GCN-NEXT: v_writelane_b32 v40, s39, 7
-; GCN-NEXT: v_writelane_b32 v40, s48, 8
-; GCN-NEXT: v_writelane_b32 v40, s49, 9
-; GCN-NEXT: v_writelane_b32 v40, s50, 10
-; GCN-NEXT: v_writelane_b32 v40, s51, 11
-; GCN-NEXT: v_writelane_b32 v40, s52, 12
-; GCN-NEXT: v_writelane_b32 v40, s53, 13
-; GCN-NEXT: v_writelane_b32 v40, s54, 14
-; GCN-NEXT: v_writelane_b32 v40, s55, 15
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
+; GCN-NEXT: v_writelane_b32 v40, s38, 4
+; GCN-NEXT: v_writelane_b32 v40, s39, 5
+; GCN-NEXT: v_writelane_b32 v40, s48, 6
+; GCN-NEXT: v_writelane_b32 v40, s49, 7
+; GCN-NEXT: v_writelane_b32 v40, s50, 8
+; GCN-NEXT: v_writelane_b32 v40, s51, 9
+; GCN-NEXT: v_writelane_b32 v40, s52, 10
+; GCN-NEXT: v_writelane_b32 v40, s53, 11
+; GCN-NEXT: v_writelane_b32 v40, s54, 12
+; GCN-NEXT: v_writelane_b32 v40, s55, 13
+; GCN-NEXT: v_writelane_b32 v40, s30, 14
+; GCN-NEXT: v_writelane_b32 v40, s31, 15
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s6, v0
@@ -1282,22 +1282,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: s_cbranch_execnz .LBB9_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_readlane_b32 s55, v40, 15
-; GCN-NEXT: v_readlane_b32 s54, v40, 14
-; GCN-NEXT: v_readlane_b32 s53, v40, 13
-; GCN-NEXT: v_readlane_b32 s52, v40, 12
-; GCN-NEXT: v_readlane_b32 s51, v40, 11
-; GCN-NEXT: v_readlane_b32 s50, v40, 10
-; GCN-NEXT: v_readlane_b32 s49, v40, 9
-; GCN-NEXT: v_readlane_b32 s48, v40, 8
-; GCN-NEXT: v_readlane_b32 s39, v40, 7
-; GCN-NEXT: v_readlane_b32 s38, v40, 6
-; GCN-NEXT: v_readlane_b32 s37, v40, 5
-; GCN-NEXT: v_readlane_b32 s36, v40, 4
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s30, v40, 14
+; GCN-NEXT: v_readlane_b32 s31, v40, 15
+; GCN-NEXT: v_readlane_b32 s55, v40, 13
+; GCN-NEXT: v_readlane_b32 s54, v40, 12
+; GCN-NEXT: v_readlane_b32 s53, v40, 11
+; GCN-NEXT: v_readlane_b32 s52, v40, 10
+; GCN-NEXT: v_readlane_b32 s51, v40, 9
+; GCN-NEXT: v_readlane_b32 s50, v40, 8
+; GCN-NEXT: v_readlane_b32 s49, v40, 7
+; GCN-NEXT: v_readlane_b32 s48, v40, 6
+; GCN-NEXT: v_readlane_b32 s39, v40, 5
+; GCN-NEXT: v_readlane_b32 s38, v40, 4
+; GCN-NEXT: v_readlane_b32 s37, v40, 3
+; GCN-NEXT: v_readlane_b32 s36, v40, 2
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -1315,22 +1315,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0x400
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: v_writelane_b32 v40, s34, 2
-; GISEL-NEXT: v_writelane_b32 v40, s35, 3
-; GISEL-NEXT: v_writelane_b32 v40, s36, 4
-; GISEL-NEXT: v_writelane_b32 v40, s37, 5
-; GISEL-NEXT: v_writelane_b32 v40, s38, 6
-; GISEL-NEXT: v_writelane_b32 v40, s39, 7
-; GISEL-NEXT: v_writelane_b32 v40, s48, 8
-; GISEL-NEXT: v_writelane_b32 v40, s49, 9
-; GISEL-NEXT: v_writelane_b32 v40, s50, 10
-; GISEL-NEXT: v_writelane_b32 v40, s51, 11
-; GISEL-NEXT: v_writelane_b32 v40, s52, 12
-; GISEL-NEXT: v_writelane_b32 v40, s53, 13
-; GISEL-NEXT: v_writelane_b32 v40, s54, 14
-; GISEL-NEXT: v_writelane_b32 v40, s55, 15
+; GISEL-NEXT: v_writelane_b32 v40, s34, 0
+; GISEL-NEXT: v_writelane_b32 v40, s35, 1
+; GISEL-NEXT: v_writelane_b32 v40, s36, 2
+; GISEL-NEXT: v_writelane_b32 v40, s37, 3
+; GISEL-NEXT: v_writelane_b32 v40, s38, 4
+; GISEL-NEXT: v_writelane_b32 v40, s39, 5
+; GISEL-NEXT: v_writelane_b32 v40, s48, 6
+; GISEL-NEXT: v_writelane_b32 v40, s49, 7
+; GISEL-NEXT: v_writelane_b32 v40, s50, 8
+; GISEL-NEXT: v_writelane_b32 v40, s51, 9
+; GISEL-NEXT: v_writelane_b32 v40, s52, 10
+; GISEL-NEXT: v_writelane_b32 v40, s53, 11
+; GISEL-NEXT: v_writelane_b32 v40, s54, 12
+; GISEL-NEXT: v_writelane_b32 v40, s55, 13
+; GISEL-NEXT: v_writelane_b32 v40, s30, 14
+; GISEL-NEXT: v_writelane_b32 v40, s31, 15
; GISEL-NEXT: s_mov_b64 s[4:5], exec
; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s6, v0
@@ -1343,22 +1343,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: s_cbranch_execnz .LBB9_1
; GISEL-NEXT: ; %bb.2:
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; GISEL-NEXT: v_readlane_b32 s55, v40, 15
-; GISEL-NEXT: v_readlane_b32 s54, v40, 14
-; GISEL-NEXT: v_readlane_b32 s53, v40, 13
-; GISEL-NEXT: v_readlane_b32 s52, v40, 12
-; GISEL-NEXT: v_readlane_b32 s51, v40, 11
-; GISEL-NEXT: v_readlane_b32 s50, v40, 10
-; GISEL-NEXT: v_readlane_b32 s49, v40, 9
-; GISEL-NEXT: v_readlane_b32 s48, v40, 8
-; GISEL-NEXT: v_readlane_b32 s39, v40, 7
-; GISEL-NEXT: v_readlane_b32 s38, v40, 6
-; GISEL-NEXT: v_readlane_b32 s37, v40, 5
-; GISEL-NEXT: v_readlane_b32 s36, v40, 4
-; GISEL-NEXT: v_readlane_b32 s35, v40, 3
-; GISEL-NEXT: v_readlane_b32 s34, v40, 2
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s30, v40, 14
+; GISEL-NEXT: v_readlane_b32 s31, v40, 15
+; GISEL-NEXT: v_readlane_b32 s55, v40, 13
+; GISEL-NEXT: v_readlane_b32 s54, v40, 12
+; GISEL-NEXT: v_readlane_b32 s53, v40, 11
+; GISEL-NEXT: v_readlane_b32 s52, v40, 10
+; GISEL-NEXT: v_readlane_b32 s51, v40, 9
+; GISEL-NEXT: v_readlane_b32 s50, v40, 8
+; GISEL-NEXT: v_readlane_b32 s49, v40, 7
+; GISEL-NEXT: v_readlane_b32 s48, v40, 6
+; GISEL-NEXT: v_readlane_b32 s39, v40, 5
+; GISEL-NEXT: v_readlane_b32 s38, v40, 4
+; GISEL-NEXT: v_readlane_b32 s37, v40, 3
+; GISEL-NEXT: v_readlane_b32 s36, v40, 2
+; GISEL-NEXT: v_readlane_b32 s35, v40, 1
+; GISEL-NEXT: v_readlane_b32 s34, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index c3f391786f878..fcc43ffd0140e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -35,8 +35,8 @@ define void @f0() {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v4, 1
; GFX11-NEXT: v_readlane_b32 s30, v4, 0
+; GFX11-NEXT: v_readlane_b32 s31, v4, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index 358a3fe65ad0f..1d83ec582451a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -7,13 +7,13 @@ define fastcc i32 @foo() #0 {
; CHECK-LABEL: name: foo
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+ ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAITCNT 0
; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33
; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32
; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.2, addrspace 5)
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.1, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17
; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
@@ -26,8 +26,8 @@ define fastcc i32 @foo() #0 {
; CHECK-NEXT: BUFFER_GL1_INV implicit $exec
; CHECK-NEXT: BUFFER_GL0_INV implicit $exec
; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
- ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40
- ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40
+ ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
; CHECK-NEXT: S_WAITCNT 49279
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo
@@ -39,12 +39,12 @@ define fastcc i32 @foo() #0 {
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.DummyReturnBlock:
+ ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0, implicit-def $sgpr30_sgpr31
; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1
- ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0
; CHECK-NEXT: $sgpr32 = S_MOV_B32 $sgpr33
; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2
; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" load (s32) from %stack.2, addrspace 5)
+ ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" load (s32) from %stack.1, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5
; CHECK-NEXT: $sgpr33 = S_MOV_B32 killed $sgpr4
; CHECK-NEXT: S_WAITCNT 16240
diff --git a/llvm/test/CodeGen/AMDGPU/issue176578.ll b/llvm/test/CodeGen/AMDGPU/issue176578.ll
index 08986d1f61efd..22c1307c779ee 100644
--- a/llvm/test/CodeGen/AMDGPU/issue176578.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue176578.ll
@@ -18,22 +18,21 @@ define <4 x i8> @issue176578() #0 {
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v41, s16, 15
-; CHECK-NEXT: v_writelane_b32 v41, s30, 0
-; CHECK-NEXT: v_writelane_b32 v41, s31, 1
-; CHECK-NEXT: v_writelane_b32 v41, s34, 2
-; CHECK-NEXT: v_writelane_b32 v41, s35, 3
-; CHECK-NEXT: v_writelane_b32 v41, s36, 4
-; CHECK-NEXT: v_writelane_b32 v41, s37, 5
-; CHECK-NEXT: v_writelane_b32 v41, s38, 6
-; CHECK-NEXT: v_writelane_b32 v41, s39, 7
-; CHECK-NEXT: v_writelane_b32 v41, s48, 8
-; CHECK-NEXT: v_writelane_b32 v41, s49, 9
-; CHECK-NEXT: v_writelane_b32 v41, s50, 10
-; CHECK-NEXT: v_writelane_b32 v41, s51, 11
-; CHECK-NEXT: v_writelane_b32 v41, s52, 12
-; CHECK-NEXT: v_writelane_b32 v41, s53, 13
+; CHECK-NEXT: v_writelane_b32 v41, s34, 0
+; CHECK-NEXT: v_writelane_b32 v41, s35, 1
+; CHECK-NEXT: v_writelane_b32 v41, s36, 2
+; CHECK-NEXT: v_writelane_b32 v41, s37, 3
+; CHECK-NEXT: v_writelane_b32 v41, s38, 4
+; CHECK-NEXT: v_writelane_b32 v41, s39, 5
+; CHECK-NEXT: v_writelane_b32 v41, s48, 6
+; CHECK-NEXT: v_writelane_b32 v41, s49, 7
+; CHECK-NEXT: v_writelane_b32 v41, s50, 8
+; CHECK-NEXT: v_writelane_b32 v41, s51, 9
+; CHECK-NEXT: v_writelane_b32 v41, s52, 10
+; CHECK-NEXT: v_writelane_b32 v41, s53, 11
+; CHECK-NEXT: v_writelane_b32 v41, s54, 12
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v41, s54, 14
+; CHECK-NEXT: v_writelane_b32 v41, s30, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
@@ -45,6 +44,7 @@ define <4 x i8> @issue176578() #0 {
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_mov_b32 s54, 0
; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v41, s31, 14
; CHECK-NEXT: s_branch .LBB0_2
; CHECK-NEXT: .LBB0_1: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
@@ -81,25 +81,25 @@ define <4 x i8> @issue176578() #0 {
; CHECK-NEXT: s_branch .LBB0_1
; CHECK-NEXT: .LBB0_4: ; %bb4
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s30, v41, 13
; CHECK-NEXT: v_mov_b32_e32 v0, s54
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
-; CHECK-NEXT: v_readlane_b32 s54, v41, 14
-; CHECK-NEXT: v_readlane_b32 s53, v41, 13
-; CHECK-NEXT: v_readlane_b32 s52, v41, 12
-; CHECK-NEXT: v_readlane_b32 s51, v41, 11
-; CHECK-NEXT: v_readlane_b32 s50, v41, 10
-; CHECK-NEXT: v_readlane_b32 s49, v41, 9
-; CHECK-NEXT: v_readlane_b32 s48, v41, 8
-; CHECK-NEXT: v_readlane_b32 s39, v41, 7
-; CHECK-NEXT: v_readlane_b32 s38, v41, 6
-; CHECK-NEXT: v_readlane_b32 s37, v41, 5
-; CHECK-NEXT: v_readlane_b32 s36, v41, 4
-; CHECK-NEXT: v_readlane_b32 s35, v41, 3
-; CHECK-NEXT: v_readlane_b32 s34, v41, 2
-; CHECK-NEXT: v_readlane_b32 s31, v41, 1
-; CHECK-NEXT: v_readlane_b32 s30, v41, 0
+; CHECK-NEXT: v_readlane_b32 s31, v41, 14
+; CHECK-NEXT: v_readlane_b32 s54, v41, 12
+; CHECK-NEXT: v_readlane_b32 s53, v41, 11
+; CHECK-NEXT: v_readlane_b32 s52, v41, 10
+; CHECK-NEXT: v_readlane_b32 s51, v41, 9
+; CHECK-NEXT: v_readlane_b32 s50, v41, 8
+; CHECK-NEXT: v_readlane_b32 s49, v41, 7
+; CHECK-NEXT: v_readlane_b32 s48, v41, 6
+; CHECK-NEXT: v_readlane_b32 s39, v41, 5
+; CHECK-NEXT: v_readlane_b32 s38, v41, 4
+; CHECK-NEXT: v_readlane_b32 s37, v41, 3
+; CHECK-NEXT: v_readlane_b32 s36, v41, 2
+; CHECK-NEXT: v_readlane_b32 s35, v41, 1
+; CHECK-NEXT: v_readlane_b32 s34, v41, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v41, 15
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 481eb1bc3d91a..68c0d78485517 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -37,26 +37,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX7-NEXT: s_add_i32 s6, s32, 0x101100
; GFX7-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v23, s30, 0
-; GFX7-NEXT: v_writelane_b32 v23, s31, 1
-; GFX7-NEXT: v_writelane_b32 v23, s33, 2
-; GFX7-NEXT: v_writelane_b32 v23, s34, 3
-; GFX7-NEXT: v_writelane_b32 v23, s35, 4
-; GFX7-NEXT: v_writelane_b32 v23, s36, 5
-; GFX7-NEXT: v_writelane_b32 v23, s37, 6
-; GFX7-NEXT: v_writelane_b32 v23, s38, 7
-; GFX7-NEXT: v_writelane_b32 v23, s39, 8
-; GFX7-NEXT: v_writelane_b32 v23, s48, 9
-; GFX7-NEXT: v_writelane_b32 v23, s49, 10
-; GFX7-NEXT: v_writelane_b32 v23, s50, 11
-; GFX7-NEXT: v_writelane_b32 v23, s51, 12
-; GFX7-NEXT: v_writelane_b32 v23, s52, 13
-; GFX7-NEXT: v_writelane_b32 v23, s53, 14
+; GFX7-NEXT: v_writelane_b32 v23, s33, 0
+; GFX7-NEXT: v_writelane_b32 v23, s34, 1
+; GFX7-NEXT: v_writelane_b32 v23, s35, 2
+; GFX7-NEXT: v_writelane_b32 v23, s36, 3
+; GFX7-NEXT: v_writelane_b32 v23, s37, 4
+; GFX7-NEXT: v_writelane_b32 v23, s38, 5
+; GFX7-NEXT: v_writelane_b32 v23, s39, 6
+; GFX7-NEXT: v_writelane_b32 v23, s48, 7
+; GFX7-NEXT: v_writelane_b32 v23, s49, 8
+; GFX7-NEXT: v_writelane_b32 v23, s50, 9
+; GFX7-NEXT: v_writelane_b32 v23, s51, 10
+; GFX7-NEXT: v_writelane_b32 v23, s52, 11
+; GFX7-NEXT: v_writelane_b32 v23, s53, 12
+; GFX7-NEXT: v_writelane_b32 v23, s54, 13
+; GFX7-NEXT: v_writelane_b32 v23, s55, 14
; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
-; GFX7-NEXT: v_writelane_b32 v23, s54, 15
+; GFX7-NEXT: v_writelane_b32 v23, s30, 15
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_writelane_b32 v23, s55, 16
+; GFX7-NEXT: v_writelane_b32 v23, s31, 16
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use alloca0 v0
; GFX7-NEXT: ;;#ASMEND
@@ -73,23 +73,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_readlane_b32 s55, v23, 16
-; GFX7-NEXT: v_readlane_b32 s54, v23, 15
-; GFX7-NEXT: v_readlane_b32 s53, v23, 14
-; GFX7-NEXT: v_readlane_b32 s52, v23, 13
-; GFX7-NEXT: v_readlane_b32 s51, v23, 12
-; GFX7-NEXT: v_readlane_b32 s50, v23, 11
-; GFX7-NEXT: v_readlane_b32 s49, v23, 10
-; GFX7-NEXT: v_readlane_b32 s48, v23, 9
-; GFX7-NEXT: v_readlane_b32 s39, v23, 8
-; GFX7-NEXT: v_readlane_b32 s38, v23, 7
-; GFX7-NEXT: v_readlane_b32 s37, v23, 6
-; GFX7-NEXT: v_readlane_b32 s36, v23, 5
-; GFX7-NEXT: v_readlane_b32 s35, v23, 4
-; GFX7-NEXT: v_readlane_b32 s34, v23, 3
-; GFX7-NEXT: v_readlane_b32 s33, v23, 2
-; GFX7-NEXT: v_readlane_b32 s31, v23, 1
-; GFX7-NEXT: v_readlane_b32 s30, v23, 0
+; GFX7-NEXT: v_readlane_b32 s30, v23, 15
+; GFX7-NEXT: v_readlane_b32 s31, v23, 16
+; GFX7-NEXT: v_readlane_b32 s55, v23, 14
+; GFX7-NEXT: v_readlane_b32 s54, v23, 13
+; GFX7-NEXT: v_readlane_b32 s53, v23, 12
+; GFX7-NEXT: v_readlane_b32 s52, v23, 11
+; GFX7-NEXT: v_readlane_b32 s51, v23, 10
+; GFX7-NEXT: v_readlane_b32 s50, v23, 9
+; GFX7-NEXT: v_readlane_b32 s49, v23, 8
+; GFX7-NEXT: v_readlane_b32 s48, v23, 7
+; GFX7-NEXT: v_readlane_b32 s39, v23, 6
+; GFX7-NEXT: v_readlane_b32 s38, v23, 5
+; GFX7-NEXT: v_readlane_b32 s37, v23, 4
+; GFX7-NEXT: v_readlane_b32 s36, v23, 3
+; GFX7-NEXT: v_readlane_b32 s35, v23, 2
+; GFX7-NEXT: v_readlane_b32 s34, v23, 1
+; GFX7-NEXT: v_readlane_b32 s33, v23, 0
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: s_add_i32 s6, s32, 0x101100
; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -104,26 +104,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v23, s30, 0
-; GFX8-NEXT: v_writelane_b32 v23, s31, 1
-; GFX8-NEXT: v_writelane_b32 v23, s33, 2
-; GFX8-NEXT: v_writelane_b32 v23, s34, 3
-; GFX8-NEXT: v_writelane_b32 v23, s35, 4
-; GFX8-NEXT: v_writelane_b32 v23, s36, 5
-; GFX8-NEXT: v_writelane_b32 v23, s37, 6
-; GFX8-NEXT: v_writelane_b32 v23, s38, 7
-; GFX8-NEXT: v_writelane_b32 v23, s39, 8
-; GFX8-NEXT: v_writelane_b32 v23, s48, 9
-; GFX8-NEXT: v_writelane_b32 v23, s49, 10
-; GFX8-NEXT: v_writelane_b32 v23, s50, 11
-; GFX8-NEXT: v_writelane_b32 v23, s51, 12
-; GFX8-NEXT: v_writelane_b32 v23, s52, 13
-; GFX8-NEXT: v_writelane_b32 v23, s53, 14
+; GFX8-NEXT: v_writelane_b32 v23, s33, 0
+; GFX8-NEXT: v_writelane_b32 v23, s34, 1
+; GFX8-NEXT: v_writelane_b32 v23, s35, 2
+; GFX8-NEXT: v_writelane_b32 v23, s36, 3
+; GFX8-NEXT: v_writelane_b32 v23, s37, 4
+; GFX8-NEXT: v_writelane_b32 v23, s38, 5
+; GFX8-NEXT: v_writelane_b32 v23, s39, 6
+; GFX8-NEXT: v_writelane_b32 v23, s48, 7
+; GFX8-NEXT: v_writelane_b32 v23, s49, 8
+; GFX8-NEXT: v_writelane_b32 v23, s50, 9
+; GFX8-NEXT: v_writelane_b32 v23, s51, 10
+; GFX8-NEXT: v_writelane_b32 v23, s52, 11
+; GFX8-NEXT: v_writelane_b32 v23, s53, 12
+; GFX8-NEXT: v_writelane_b32 v23, s54, 13
+; GFX8-NEXT: v_writelane_b32 v23, s55, 14
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT: v_writelane_b32 v23, s54, 15
+; GFX8-NEXT: v_writelane_b32 v23, s30, 15
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v23, s55, 16
+; GFX8-NEXT: v_writelane_b32 v23, s31, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
@@ -141,23 +141,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v23, 16
-; GFX8-NEXT: v_readlane_b32 s54, v23, 15
-; GFX8-NEXT: v_readlane_b32 s53, v23, 14
-; GFX8-NEXT: v_readlane_b32 s52, v23, 13
-; GFX8-NEXT: v_readlane_b32 s51, v23, 12
-; GFX8-NEXT: v_readlane_b32 s50, v23, 11
-; GFX8-NEXT: v_readlane_b32 s49, v23, 10
-; GFX8-NEXT: v_readlane_b32 s48, v23, 9
-; GFX8-NEXT: v_readlane_b32 s39, v23, 8
-; GFX8-NEXT: v_readlane_b32 s38, v23, 7
-; GFX8-NEXT: v_readlane_b32 s37, v23, 6
-; GFX8-NEXT: v_readlane_b32 s36, v23, 5
-; GFX8-NEXT: v_readlane_b32 s35, v23, 4
-; GFX8-NEXT: v_readlane_b32 s34, v23, 3
-; GFX8-NEXT: v_readlane_b32 s33, v23, 2
-; GFX8-NEXT: v_readlane_b32 s31, v23, 1
-; GFX8-NEXT: v_readlane_b32 s30, v23, 0
+; GFX8-NEXT: v_readlane_b32 s30, v23, 15
+; GFX8-NEXT: v_readlane_b32 s31, v23, 16
+; GFX8-NEXT: v_readlane_b32 s55, v23, 14
+; GFX8-NEXT: v_readlane_b32 s54, v23, 13
+; GFX8-NEXT: v_readlane_b32 s53, v23, 12
+; GFX8-NEXT: v_readlane_b32 s52, v23, 11
+; GFX8-NEXT: v_readlane_b32 s51, v23, 10
+; GFX8-NEXT: v_readlane_b32 s50, v23, 9
+; GFX8-NEXT: v_readlane_b32 s49, v23, 8
+; GFX8-NEXT: v_readlane_b32 s48, v23, 7
+; GFX8-NEXT: v_readlane_b32 s39, v23, 6
+; GFX8-NEXT: v_readlane_b32 s38, v23, 5
+; GFX8-NEXT: v_readlane_b32 s37, v23, 4
+; GFX8-NEXT: v_readlane_b32 s36, v23, 3
+; GFX8-NEXT: v_readlane_b32 s35, v23, 2
+; GFX8-NEXT: v_readlane_b32 s34, v23, 1
+; GFX8-NEXT: v_readlane_b32 s33, v23, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -172,26 +172,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
; GFX900-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v23, s30, 0
-; GFX900-NEXT: v_writelane_b32 v23, s31, 1
-; GFX900-NEXT: v_writelane_b32 v23, s33, 2
-; GFX900-NEXT: v_writelane_b32 v23, s34, 3
-; GFX900-NEXT: v_writelane_b32 v23, s35, 4
-; GFX900-NEXT: v_writelane_b32 v23, s36, 5
-; GFX900-NEXT: v_writelane_b32 v23, s37, 6
-; GFX900-NEXT: v_writelane_b32 v23, s38, 7
-; GFX900-NEXT: v_writelane_b32 v23, s39, 8
-; GFX900-NEXT: v_writelane_b32 v23, s48, 9
-; GFX900-NEXT: v_writelane_b32 v23, s49, 10
-; GFX900-NEXT: v_writelane_b32 v23, s50, 11
-; GFX900-NEXT: v_writelane_b32 v23, s51, 12
-; GFX900-NEXT: v_writelane_b32 v23, s52, 13
-; GFX900-NEXT: v_writelane_b32 v23, s53, 14
+; GFX900-NEXT: v_writelane_b32 v23, s33, 0
+; GFX900-NEXT: v_writelane_b32 v23, s34, 1
+; GFX900-NEXT: v_writelane_b32 v23, s35, 2
+; GFX900-NEXT: v_writelane_b32 v23, s36, 3
+; GFX900-NEXT: v_writelane_b32 v23, s37, 4
+; GFX900-NEXT: v_writelane_b32 v23, s38, 5
+; GFX900-NEXT: v_writelane_b32 v23, s39, 6
+; GFX900-NEXT: v_writelane_b32 v23, s48, 7
+; GFX900-NEXT: v_writelane_b32 v23, s49, 8
+; GFX900-NEXT: v_writelane_b32 v23, s50, 9
+; GFX900-NEXT: v_writelane_b32 v23, s51, 10
+; GFX900-NEXT: v_writelane_b32 v23, s52, 11
+; GFX900-NEXT: v_writelane_b32 v23, s53, 12
+; GFX900-NEXT: v_writelane_b32 v23, s54, 13
+; GFX900-NEXT: v_writelane_b32 v23, s55, 14
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT: v_writelane_b32 v23, s54, 15
+; GFX900-NEXT: v_writelane_b32 v23, s30, 15
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v23, s55, 16
+; GFX900-NEXT: v_writelane_b32 v23, s31, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
@@ -208,23 +208,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v23, 16
-; GFX900-NEXT: v_readlane_b32 s54, v23, 15
-; GFX900-NEXT: v_readlane_b32 s53, v23, 14
-; GFX900-NEXT: v_readlane_b32 s52, v23, 13
-; GFX900-NEXT: v_readlane_b32 s51, v23, 12
-; GFX900-NEXT: v_readlane_b32 s50, v23, 11
-; GFX900-NEXT: v_readlane_b32 s49, v23, 10
-; GFX900-NEXT: v_readlane_b32 s48, v23, 9
-; GFX900-NEXT: v_readlane_b32 s39, v23, 8
-; GFX900-NEXT: v_readlane_b32 s38, v23, 7
-; GFX900-NEXT: v_readlane_b32 s37, v23, 6
-; GFX900-NEXT: v_readlane_b32 s36, v23, 5
-; GFX900-NEXT: v_readlane_b32 s35, v23, 4
-; GFX900-NEXT: v_readlane_b32 s34, v23, 3
-; GFX900-NEXT: v_readlane_b32 s33, v23, 2
-; GFX900-NEXT: v_readlane_b32 s31, v23, 1
-; GFX900-NEXT: v_readlane_b32 s30, v23, 0
+; GFX900-NEXT: v_readlane_b32 s30, v23, 15
+; GFX900-NEXT: v_readlane_b32 s31, v23, 16
+; GFX900-NEXT: v_readlane_b32 s55, v23, 14
+; GFX900-NEXT: v_readlane_b32 s54, v23, 13
+; GFX900-NEXT: v_readlane_b32 s53, v23, 12
+; GFX900-NEXT: v_readlane_b32 s52, v23, 11
+; GFX900-NEXT: v_readlane_b32 s51, v23, 10
+; GFX900-NEXT: v_readlane_b32 s50, v23, 9
+; GFX900-NEXT: v_readlane_b32 s49, v23, 8
+; GFX900-NEXT: v_readlane_b32 s48, v23, 7
+; GFX900-NEXT: v_readlane_b32 s39, v23, 6
+; GFX900-NEXT: v_readlane_b32 s38, v23, 5
+; GFX900-NEXT: v_readlane_b32 s37, v23, 4
+; GFX900-NEXT: v_readlane_b32 s36, v23, 3
+; GFX900-NEXT: v_readlane_b32 s35, v23, 2
+; GFX900-NEXT: v_readlane_b32 s34, v23, 1
+; GFX900-NEXT: v_readlane_b32 s33, v23, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -239,26 +239,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
; GFX942-NEXT: scratch_store_dword off, v23, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: v_writelane_b32 v23, s30, 0
-; GFX942-NEXT: v_writelane_b32 v23, s31, 1
-; GFX942-NEXT: v_writelane_b32 v23, s33, 2
-; GFX942-NEXT: v_writelane_b32 v23, s34, 3
-; GFX942-NEXT: v_writelane_b32 v23, s35, 4
-; GFX942-NEXT: v_writelane_b32 v23, s36, 5
-; GFX942-NEXT: v_writelane_b32 v23, s37, 6
-; GFX942-NEXT: v_writelane_b32 v23, s38, 7
-; GFX942-NEXT: v_writelane_b32 v23, s39, 8
-; GFX942-NEXT: v_writelane_b32 v23, s48, 9
-; GFX942-NEXT: v_writelane_b32 v23, s49, 10
-; GFX942-NEXT: v_writelane_b32 v23, s50, 11
-; GFX942-NEXT: v_writelane_b32 v23, s51, 12
-; GFX942-NEXT: v_writelane_b32 v23, s52, 13
-; GFX942-NEXT: v_writelane_b32 v23, s53, 14
+; GFX942-NEXT: v_writelane_b32 v23, s33, 0
+; GFX942-NEXT: v_writelane_b32 v23, s34, 1
+; GFX942-NEXT: v_writelane_b32 v23, s35, 2
+; GFX942-NEXT: v_writelane_b32 v23, s36, 3
+; GFX942-NEXT: v_writelane_b32 v23, s37, 4
+; GFX942-NEXT: v_writelane_b32 v23, s38, 5
+; GFX942-NEXT: v_writelane_b32 v23, s39, 6
+; GFX942-NEXT: v_writelane_b32 v23, s48, 7
+; GFX942-NEXT: v_writelane_b32 v23, s49, 8
+; GFX942-NEXT: v_writelane_b32 v23, s50, 9
+; GFX942-NEXT: v_writelane_b32 v23, s51, 10
+; GFX942-NEXT: v_writelane_b32 v23, s52, 11
+; GFX942-NEXT: v_writelane_b32 v23, s53, 12
+; GFX942-NEXT: v_writelane_b32 v23, s54, 13
+; GFX942-NEXT: v_writelane_b32 v23, s55, 14
; GFX942-NEXT: s_add_i32 s0, s32, 64
-; GFX942-NEXT: v_writelane_b32 v23, s54, 15
+; GFX942-NEXT: v_writelane_b32 v23, s30, 15
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT: v_writelane_b32 v23, s55, 16
+; GFX942-NEXT: v_writelane_b32 v23, s31, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
@@ -273,23 +273,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v23, 16
-; GFX942-NEXT: v_readlane_b32 s54, v23, 15
-; GFX942-NEXT: v_readlane_b32 s53, v23, 14
-; GFX942-NEXT: v_readlane_b32 s52, v23, 13
-; GFX942-NEXT: v_readlane_b32 s51, v23, 12
-; GFX942-NEXT: v_readlane_b32 s50, v23, 11
-; GFX942-NEXT: v_readlane_b32 s49, v23, 10
-; GFX942-NEXT: v_readlane_b32 s48, v23, 9
-; GFX942-NEXT: v_readlane_b32 s39, v23, 8
-; GFX942-NEXT: v_readlane_b32 s38, v23, 7
-; GFX942-NEXT: v_readlane_b32 s37, v23, 6
-; GFX942-NEXT: v_readlane_b32 s36, v23, 5
-; GFX942-NEXT: v_readlane_b32 s35, v23, 4
-; GFX942-NEXT: v_readlane_b32 s34, v23, 3
-; GFX942-NEXT: v_readlane_b32 s33, v23, 2
-; GFX942-NEXT: v_readlane_b32 s31, v23, 1
-; GFX942-NEXT: v_readlane_b32 s30, v23, 0
+; GFX942-NEXT: v_readlane_b32 s30, v23, 15
+; GFX942-NEXT: v_readlane_b32 s31, v23, 16
+; GFX942-NEXT: v_readlane_b32 s55, v23, 14
+; GFX942-NEXT: v_readlane_b32 s54, v23, 13
+; GFX942-NEXT: v_readlane_b32 s53, v23, 12
+; GFX942-NEXT: v_readlane_b32 s52, v23, 11
+; GFX942-NEXT: v_readlane_b32 s51, v23, 10
+; GFX942-NEXT: v_readlane_b32 s50, v23, 9
+; GFX942-NEXT: v_readlane_b32 s49, v23, 8
+; GFX942-NEXT: v_readlane_b32 s48, v23, 7
+; GFX942-NEXT: v_readlane_b32 s39, v23, 6
+; GFX942-NEXT: v_readlane_b32 s38, v23, 5
+; GFX942-NEXT: v_readlane_b32 s37, v23, 4
+; GFX942-NEXT: v_readlane_b32 s36, v23, 3
+; GFX942-NEXT: v_readlane_b32 s35, v23, 2
+; GFX942-NEXT: v_readlane_b32 s34, v23, 1
+; GFX942-NEXT: v_readlane_b32 s33, v23, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
; GFX942-NEXT: scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
@@ -305,29 +305,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_1-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT: v_writelane_b32 v23, s33, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT: v_writelane_b32 v23, s34, 1
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2
-; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3
-; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4
-; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5
-; GFX10_1-NEXT: v_writelane_b32 v23, s37, 6
-; GFX10_1-NEXT: v_writelane_b32 v23, s38, 7
-; GFX10_1-NEXT: v_writelane_b32 v23, s39, 8
-; GFX10_1-NEXT: v_writelane_b32 v23, s48, 9
-; GFX10_1-NEXT: v_writelane_b32 v23, s49, 10
-; GFX10_1-NEXT: v_writelane_b32 v23, s50, 11
-; GFX10_1-NEXT: v_writelane_b32 v23, s51, 12
-; GFX10_1-NEXT: v_writelane_b32 v23, s52, 13
-; GFX10_1-NEXT: v_writelane_b32 v23, s53, 14
-; GFX10_1-NEXT: v_writelane_b32 v23, s54, 15
-; GFX10_1-NEXT: v_writelane_b32 v23, s55, 16
+; GFX10_1-NEXT: v_writelane_b32 v23, s35, 2
+; GFX10_1-NEXT: v_writelane_b32 v23, s36, 3
+; GFX10_1-NEXT: v_writelane_b32 v23, s37, 4
+; GFX10_1-NEXT: v_writelane_b32 v23, s38, 5
+; GFX10_1-NEXT: v_writelane_b32 v23, s39, 6
+; GFX10_1-NEXT: v_writelane_b32 v23, s48, 7
+; GFX10_1-NEXT: v_writelane_b32 v23, s49, 8
+; GFX10_1-NEXT: v_writelane_b32 v23, s50, 9
+; GFX10_1-NEXT: v_writelane_b32 v23, s51, 10
+; GFX10_1-NEXT: v_writelane_b32 v23, s52, 11
+; GFX10_1-NEXT: v_writelane_b32 v23, s53, 12
+; GFX10_1-NEXT: v_writelane_b32 v23, s54, 13
+; GFX10_1-NEXT: v_writelane_b32 v23, s55, 14
+; GFX10_1-NEXT: v_writelane_b32 v23, s30, 15
+; GFX10_1-NEXT: v_writelane_b32 v23, s31, 16
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX10_1-NEXT: ;;#ASMEND
@@ -338,23 +338,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v23, 16
-; GFX10_1-NEXT: v_readlane_b32 s54, v23, 15
-; GFX10_1-NEXT: v_readlane_b32 s53, v23, 14
-; GFX10_1-NEXT: v_readlane_b32 s52, v23, 13
-; GFX10_1-NEXT: v_readlane_b32 s51, v23, 12
-; GFX10_1-NEXT: v_readlane_b32 s50, v23, 11
-; GFX10_1-NEXT: v_readlane_b32 s49, v23, 10
-; GFX10_1-NEXT: v_readlane_b32 s48, v23, 9
-; GFX10_1-NEXT: v_readlane_b32 s39, v23, 8
-; GFX10_1-NEXT: v_readlane_b32 s38, v23, 7
-; GFX10_1-NEXT: v_readlane_b32 s37, v23, 6
-; GFX10_1-NEXT: v_readlane_b32 s36, v23, 5
-; GFX10_1-NEXT: v_readlane_b32 s35, v23, 4
-; GFX10_1-NEXT: v_readlane_b32 s34, v23, 3
-; GFX10_1-NEXT: v_readlane_b32 s33, v23, 2
-; GFX10_1-NEXT: v_readlane_b32 s31, v23, 1
-; GFX10_1-NEXT: v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT: v_readlane_b32 s30, v23, 15
+; GFX10_1-NEXT: v_readlane_b32 s31, v23, 16
+; GFX10_1-NEXT: v_readlane_b32 s55, v23, 14
+; GFX10_1-NEXT: v_readlane_b32 s54, v23, 13
+; GFX10_1-NEXT: v_readlane_b32 s53, v23, 12
+; GFX10_1-NEXT: v_readlane_b32 s52, v23, 11
+; GFX10_1-NEXT: v_readlane_b32 s51, v23, 10
+; GFX10_1-NEXT: v_readlane_b32 s50, v23, 9
+; GFX10_1-NEXT: v_readlane_b32 s49, v23, 8
+; GFX10_1-NEXT: v_readlane_b32 s48, v23, 7
+; GFX10_1-NEXT: v_readlane_b32 s39, v23, 6
+; GFX10_1-NEXT: v_readlane_b32 s38, v23, 5
+; GFX10_1-NEXT: v_readlane_b32 s37, v23, 4
+; GFX10_1-NEXT: v_readlane_b32 s36, v23, 3
+; GFX10_1-NEXT: v_readlane_b32 s35, v23, 2
+; GFX10_1-NEXT: v_readlane_b32 s34, v23, 1
+; GFX10_1-NEXT: v_readlane_b32 s33, v23, 0
; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
; GFX10_1-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -370,29 +370,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT: v_writelane_b32 v23, s33, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT: v_writelane_b32 v23, s34, 1
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2
-; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3
-; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4
-; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5
-; GFX10_3-NEXT: v_writelane_b32 v23, s37, 6
-; GFX10_3-NEXT: v_writelane_b32 v23, s38, 7
-; GFX10_3-NEXT: v_writelane_b32 v23, s39, 8
-; GFX10_3-NEXT: v_writelane_b32 v23, s48, 9
-; GFX10_3-NEXT: v_writelane_b32 v23, s49, 10
-; GFX10_3-NEXT: v_writelane_b32 v23, s50, 11
-; GFX10_3-NEXT: v_writelane_b32 v23, s51, 12
-; GFX10_3-NEXT: v_writelane_b32 v23, s52, 13
-; GFX10_3-NEXT: v_writelane_b32 v23, s53, 14
-; GFX10_3-NEXT: v_writelane_b32 v23, s54, 15
-; GFX10_3-NEXT: v_writelane_b32 v23, s55, 16
+; GFX10_3-NEXT: v_writelane_b32 v23, s35, 2
+; GFX10_3-NEXT: v_writelane_b32 v23, s36, 3
+; GFX10_3-NEXT: v_writelane_b32 v23, s37, 4
+; GFX10_3-NEXT: v_writelane_b32 v23, s38, 5
+; GFX10_3-NEXT: v_writelane_b32 v23, s39, 6
+; GFX10_3-NEXT: v_writelane_b32 v23, s48, 7
+; GFX10_3-NEXT: v_writelane_b32 v23, s49, 8
+; GFX10_3-NEXT: v_writelane_b32 v23, s50, 9
+; GFX10_3-NEXT: v_writelane_b32 v23, s51, 10
+; GFX10_3-NEXT: v_writelane_b32 v23, s52, 11
+; GFX10_3-NEXT: v_writelane_b32 v23, s53, 12
+; GFX10_3-NEXT: v_writelane_b32 v23, s54, 13
+; GFX10_3-NEXT: v_writelane_b32 v23, s55, 14
+; GFX10_3-NEXT: v_writelane_b32 v23, s30, 15
+; GFX10_3-NEXT: v_writelane_b32 v23, s31, 16
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX10_3-NEXT: ;;#ASMEND
@@ -403,23 +403,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v23, 16
-; GFX10_3-NEXT: v_readlane_b32 s54, v23, 15
-; GFX10_3-NEXT: v_readlane_b32 s53, v23, 14
-; GFX10_3-NEXT: v_readlane_b32 s52, v23, 13
-; GFX10_3-NEXT: v_readlane_b32 s51, v23, 12
-; GFX10_3-NEXT: v_readlane_b32 s50, v23, 11
-; GFX10_3-NEXT: v_readlane_b32 s49, v23, 10
-; GFX10_3-NEXT: v_readlane_b32 s48, v23, 9
-; GFX10_3-NEXT: v_readlane_b32 s39, v23, 8
-; GFX10_3-NEXT: v_readlane_b32 s38, v23, 7
-; GFX10_3-NEXT: v_readlane_b32 s37, v23, 6
-; GFX10_3-NEXT: v_readlane_b32 s36, v23, 5
-; GFX10_3-NEXT: v_readlane_b32 s35, v23, 4
-; GFX10_3-NEXT: v_readlane_b32 s34, v23, 3
-; GFX10_3-NEXT: v_readlane_b32 s33, v23, 2
-; GFX10_3-NEXT: v_readlane_b32 s31, v23, 1
-; GFX10_3-NEXT: v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT: v_readlane_b32 s30, v23, 15
+; GFX10_3-NEXT: v_readlane_b32 s31, v23, 16
+; GFX10_3-NEXT: v_readlane_b32 s55, v23, 14
+; GFX10_3-NEXT: v_readlane_b32 s54, v23, 13
+; GFX10_3-NEXT: v_readlane_b32 s53, v23, 12
+; GFX10_3-NEXT: v_readlane_b32 s52, v23, 11
+; GFX10_3-NEXT: v_readlane_b32 s51, v23, 10
+; GFX10_3-NEXT: v_readlane_b32 s50, v23, 9
+; GFX10_3-NEXT: v_readlane_b32 s49, v23, 8
+; GFX10_3-NEXT: v_readlane_b32 s48, v23, 7
+; GFX10_3-NEXT: v_readlane_b32 s39, v23, 6
+; GFX10_3-NEXT: v_readlane_b32 s38, v23, 5
+; GFX10_3-NEXT: v_readlane_b32 s37, v23, 4
+; GFX10_3-NEXT: v_readlane_b32 s36, v23, 3
+; GFX10_3-NEXT: v_readlane_b32 s35, v23, 2
+; GFX10_3-NEXT: v_readlane_b32 s34, v23, 1
+; GFX10_3-NEXT: v_readlane_b32 s33, v23, 0
; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
; GFX10_3-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -434,30 +434,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v23, s30, 0
+; GFX11-NEXT: v_writelane_b32 v23, s33, 0
; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_writelane_b32 v23, s31, 1
+; GFX11-NEXT: v_writelane_b32 v23, s34, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v23, s33, 2
-; GFX11-NEXT: v_writelane_b32 v23, s34, 3
-; GFX11-NEXT: v_writelane_b32 v23, s35, 4
-; GFX11-NEXT: v_writelane_b32 v23, s36, 5
-; GFX11-NEXT: v_writelane_b32 v23, s37, 6
-; GFX11-NEXT: v_writelane_b32 v23, s38, 7
-; GFX11-NEXT: v_writelane_b32 v23, s39, 8
-; GFX11-NEXT: v_writelane_b32 v23, s48, 9
-; GFX11-NEXT: v_writelane_b32 v23, s49, 10
-; GFX11-NEXT: v_writelane_b32 v23, s50, 11
-; GFX11-NEXT: v_writelane_b32 v23, s51, 12
-; GFX11-NEXT: v_writelane_b32 v23, s52, 13
-; GFX11-NEXT: v_writelane_b32 v23, s53, 14
-; GFX11-NEXT: v_writelane_b32 v23, s54, 15
-; GFX11-NEXT: v_writelane_b32 v23, s55, 16
+; GFX11-NEXT: v_writelane_b32 v23, s35, 2
+; GFX11-NEXT: v_writelane_b32 v23, s36, 3
+; GFX11-NEXT: v_writelane_b32 v23, s37, 4
+; GFX11-NEXT: v_writelane_b32 v23, s38, 5
+; GFX11-NEXT: v_writelane_b32 v23, s39, 6
+; GFX11-NEXT: v_writelane_b32 v23, s48, 7
+; GFX11-NEXT: v_writelane_b32 v23, s49, 8
+; GFX11-NEXT: v_writelane_b32 v23, s50, 9
+; GFX11-NEXT: v_writelane_b32 v23, s51, 10
+; GFX11-NEXT: v_writelane_b32 v23, s52, 11
+; GFX11-NEXT: v_writelane_b32 v23, s53, 12
+; GFX11-NEXT: v_writelane_b32 v23, s54, 13
+; GFX11-NEXT: v_writelane_b32 v23, s55, 14
+; GFX11-NEXT: v_writelane_b32 v23, s30, 15
+; GFX11-NEXT: v_writelane_b32 v23, s31, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX11-NEXT: ;;#ASMEND
@@ -470,23 +470,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v23, 16
-; GFX11-NEXT: v_readlane_b32 s54, v23, 15
-; GFX11-NEXT: v_readlane_b32 s53, v23, 14
-; GFX11-NEXT: v_readlane_b32 s52, v23, 13
-; GFX11-NEXT: v_readlane_b32 s51, v23, 12
-; GFX11-NEXT: v_readlane_b32 s50, v23, 11
-; GFX11-NEXT: v_readlane_b32 s49, v23, 10
-; GFX11-NEXT: v_readlane_b32 s48, v23, 9
-; GFX11-NEXT: v_readlane_b32 s39, v23, 8
-; GFX11-NEXT: v_readlane_b32 s38, v23, 7
-; GFX11-NEXT: v_readlane_b32 s37, v23, 6
-; GFX11-NEXT: v_readlane_b32 s36, v23, 5
-; GFX11-NEXT: v_readlane_b32 s35, v23, 4
-; GFX11-NEXT: v_readlane_b32 s34, v23, 3
-; GFX11-NEXT: v_readlane_b32 s33, v23, 2
-; GFX11-NEXT: v_readlane_b32 s31, v23, 1
-; GFX11-NEXT: v_readlane_b32 s30, v23, 0
+; GFX11-NEXT: v_readlane_b32 s30, v23, 15
+; GFX11-NEXT: v_readlane_b32 s31, v23, 16
+; GFX11-NEXT: v_readlane_b32 s55, v23, 14
+; GFX11-NEXT: v_readlane_b32 s54, v23, 13
+; GFX11-NEXT: v_readlane_b32 s53, v23, 12
+; GFX11-NEXT: v_readlane_b32 s52, v23, 11
+; GFX11-NEXT: v_readlane_b32 s51, v23, 10
+; GFX11-NEXT: v_readlane_b32 s50, v23, 9
+; GFX11-NEXT: v_readlane_b32 s49, v23, 8
+; GFX11-NEXT: v_readlane_b32 s48, v23, 7
+; GFX11-NEXT: v_readlane_b32 s39, v23, 6
+; GFX11-NEXT: v_readlane_b32 s38, v23, 5
+; GFX11-NEXT: v_readlane_b32 s37, v23, 4
+; GFX11-NEXT: v_readlane_b32 s36, v23, 3
+; GFX11-NEXT: v_readlane_b32 s35, v23, 2
+; GFX11-NEXT: v_readlane_b32 s34, v23, 1
+; GFX11-NEXT: v_readlane_b32 s33, v23, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
; GFX11-NEXT: scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
@@ -505,28 +505,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v23, s30, 0
+; GFX12-NEXT: v_writelane_b32 v23, s33, 0
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_writelane_b32 v23, s31, 1
-; GFX12-NEXT: v_writelane_b32 v23, s33, 2
-; GFX12-NEXT: v_writelane_b32 v23, s34, 3
-; GFX12-NEXT: v_writelane_b32 v23, s35, 4
-; GFX12-NEXT: v_writelane_b32 v23, s36, 5
-; GFX12-NEXT: v_writelane_b32 v23, s37, 6
-; GFX12-NEXT: v_writelane_b32 v23, s38, 7
-; GFX12-NEXT: v_writelane_b32 v23, s39, 8
-; GFX12-NEXT: v_writelane_b32 v23, s48, 9
-; GFX12-NEXT: v_writelane_b32 v23, s49, 10
-; GFX12-NEXT: v_writelane_b32 v23, s50, 11
-; GFX12-NEXT: v_writelane_b32 v23, s51, 12
-; GFX12-NEXT: v_writelane_b32 v23, s52, 13
-; GFX12-NEXT: v_writelane_b32 v23, s53, 14
-; GFX12-NEXT: v_writelane_b32 v23, s54, 15
-; GFX12-NEXT: v_writelane_b32 v23, s55, 16
+; GFX12-NEXT: v_writelane_b32 v23, s34, 1
+; GFX12-NEXT: v_writelane_b32 v23, s35, 2
+; GFX12-NEXT: v_writelane_b32 v23, s36, 3
+; GFX12-NEXT: v_writelane_b32 v23, s37, 4
+; GFX12-NEXT: v_writelane_b32 v23, s38, 5
+; GFX12-NEXT: v_writelane_b32 v23, s39, 6
+; GFX12-NEXT: v_writelane_b32 v23, s48, 7
+; GFX12-NEXT: v_writelane_b32 v23, s49, 8
+; GFX12-NEXT: v_writelane_b32 v23, s50, 9
+; GFX12-NEXT: v_writelane_b32 v23, s51, 10
+; GFX12-NEXT: v_writelane_b32 v23, s52, 11
+; GFX12-NEXT: v_writelane_b32 v23, s53, 12
+; GFX12-NEXT: v_writelane_b32 v23, s54, 13
+; GFX12-NEXT: v_writelane_b32 v23, s55, 14
+; GFX12-NEXT: v_writelane_b32 v23, s30, 15
+; GFX12-NEXT: v_writelane_b32 v23, s31, 16
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX12-NEXT: ;;#ASMEND
@@ -540,23 +540,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v23, 16
-; GFX12-NEXT: v_readlane_b32 s54, v23, 15
-; GFX12-NEXT: v_readlane_b32 s53, v23, 14
-; GFX12-NEXT: v_readlane_b32 s52, v23, 13
-; GFX12-NEXT: v_readlane_b32 s51, v23, 12
-; GFX12-NEXT: v_readlane_b32 s50, v23, 11
-; GFX12-NEXT: v_readlane_b32 s49, v23, 10
-; GFX12-NEXT: v_readlane_b32 s48, v23, 9
-; GFX12-NEXT: v_readlane_b32 s39, v23, 8
-; GFX12-NEXT: v_readlane_b32 s38, v23, 7
-; GFX12-NEXT: v_readlane_b32 s37, v23, 6
-; GFX12-NEXT: v_readlane_b32 s36, v23, 5
-; GFX12-NEXT: v_readlane_b32 s35, v23, 4
-; GFX12-NEXT: v_readlane_b32 s34, v23, 3
-; GFX12-NEXT: v_readlane_b32 s33, v23, 2
-; GFX12-NEXT: v_readlane_b32 s31, v23, 1
-; GFX12-NEXT: v_readlane_b32 s30, v23, 0
+; GFX12-NEXT: v_readlane_b32 s30, v23, 15
+; GFX12-NEXT: v_readlane_b32 s31, v23, 16
+; GFX12-NEXT: v_readlane_b32 s55, v23, 14
+; GFX12-NEXT: v_readlane_b32 s54, v23, 13
+; GFX12-NEXT: v_readlane_b32 s53, v23, 12
+; GFX12-NEXT: v_readlane_b32 s52, v23, 11
+; GFX12-NEXT: v_readlane_b32 s51, v23, 10
+; GFX12-NEXT: v_readlane_b32 s50, v23, 9
+; GFX12-NEXT: v_readlane_b32 s49, v23, 8
+; GFX12-NEXT: v_readlane_b32 s48, v23, 7
+; GFX12-NEXT: v_readlane_b32 s39, v23, 6
+; GFX12-NEXT: v_readlane_b32 s38, v23, 5
+; GFX12-NEXT: v_readlane_b32 s37, v23, 4
+; GFX12-NEXT: v_readlane_b32 s36, v23, 3
+; GFX12-NEXT: v_readlane_b32 s35, v23, 2
+; GFX12-NEXT: v_readlane_b32 s34, v23, 1
+; GFX12-NEXT: v_readlane_b32 s33, v23, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -613,24 +613,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: s_add_i32 s6, s32, 0x100400
; GFX7-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: v_writelane_b32 v21, s30, 0
-; GFX7-NEXT: v_writelane_b32 v21, s31, 1
-; GFX7-NEXT: v_writelane_b32 v21, s33, 2
-; GFX7-NEXT: v_writelane_b32 v21, s34, 3
-; GFX7-NEXT: v_writelane_b32 v21, s35, 4
-; GFX7-NEXT: v_writelane_b32 v21, s36, 5
-; GFX7-NEXT: v_writelane_b32 v21, s37, 6
-; GFX7-NEXT: v_writelane_b32 v21, s38, 7
-; GFX7-NEXT: v_writelane_b32 v21, s39, 8
-; GFX7-NEXT: v_writelane_b32 v21, s48, 9
-; GFX7-NEXT: v_writelane_b32 v21, s49, 10
-; GFX7-NEXT: v_writelane_b32 v21, s50, 11
-; GFX7-NEXT: v_writelane_b32 v21, s51, 12
-; GFX7-NEXT: v_writelane_b32 v21, s52, 13
-; GFX7-NEXT: v_writelane_b32 v21, s53, 14
-; GFX7-NEXT: v_writelane_b32 v21, s54, 15
+; GFX7-NEXT: v_writelane_b32 v21, s33, 0
+; GFX7-NEXT: v_writelane_b32 v21, s34, 1
+; GFX7-NEXT: v_writelane_b32 v21, s35, 2
+; GFX7-NEXT: v_writelane_b32 v21, s36, 3
+; GFX7-NEXT: v_writelane_b32 v21, s37, 4
+; GFX7-NEXT: v_writelane_b32 v21, s38, 5
+; GFX7-NEXT: v_writelane_b32 v21, s39, 6
+; GFX7-NEXT: v_writelane_b32 v21, s48, 7
+; GFX7-NEXT: v_writelane_b32 v21, s49, 8
+; GFX7-NEXT: v_writelane_b32 v21, s50, 9
+; GFX7-NEXT: v_writelane_b32 v21, s51, 10
+; GFX7-NEXT: v_writelane_b32 v21, s52, 11
+; GFX7-NEXT: v_writelane_b32 v21, s53, 12
+; GFX7-NEXT: v_writelane_b32 v21, s54, 13
+; GFX7-NEXT: v_writelane_b32 v21, s55, 14
+; GFX7-NEXT: v_writelane_b32 v21, s30, 15
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_writelane_b32 v21, s55, 16
+; GFX7-NEXT: v_writelane_b32 v21, s31, 16
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX7-NEXT: ;;#ASMEND
@@ -640,23 +640,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_readlane_b32 s55, v21, 16
-; GFX7-NEXT: v_readlane_b32 s54, v21, 15
-; GFX7-NEXT: v_readlane_b32 s53, v21, 14
-; GFX7-NEXT: v_readlane_b32 s52, v21, 13
-; GFX7-NEXT: v_readlane_b32 s51, v21, 12
-; GFX7-NEXT: v_readlane_b32 s50, v21, 11
-; GFX7-NEXT: v_readlane_b32 s49, v21, 10
-; GFX7-NEXT: v_readlane_b32 s48, v21, 9
-; GFX7-NEXT: v_readlane_b32 s39, v21, 8
-; GFX7-NEXT: v_readlane_b32 s38, v21, 7
-; GFX7-NEXT: v_readlane_b32 s37, v21, 6
-; GFX7-NEXT: v_readlane_b32 s36, v21, 5
-; GFX7-NEXT: v_readlane_b32 s35, v21, 4
-; GFX7-NEXT: v_readlane_b32 s34, v21, 3
-; GFX7-NEXT: v_readlane_b32 s33, v21, 2
-; GFX7-NEXT: v_readlane_b32 s31, v21, 1
-; GFX7-NEXT: v_readlane_b32 s30, v21, 0
+; GFX7-NEXT: v_readlane_b32 s30, v21, 15
+; GFX7-NEXT: v_readlane_b32 s31, v21, 16
+; GFX7-NEXT: v_readlane_b32 s55, v21, 14
+; GFX7-NEXT: v_readlane_b32 s54, v21, 13
+; GFX7-NEXT: v_readlane_b32 s53, v21, 12
+; GFX7-NEXT: v_readlane_b32 s52, v21, 11
+; GFX7-NEXT: v_readlane_b32 s51, v21, 10
+; GFX7-NEXT: v_readlane_b32 s50, v21, 9
+; GFX7-NEXT: v_readlane_b32 s49, v21, 8
+; GFX7-NEXT: v_readlane_b32 s48, v21, 7
+; GFX7-NEXT: v_readlane_b32 s39, v21, 6
+; GFX7-NEXT: v_readlane_b32 s38, v21, 5
+; GFX7-NEXT: v_readlane_b32 s37, v21, 4
+; GFX7-NEXT: v_readlane_b32 s36, v21, 3
+; GFX7-NEXT: v_readlane_b32 s35, v21, 2
+; GFX7-NEXT: v_readlane_b32 s34, v21, 1
+; GFX7-NEXT: v_readlane_b32 s33, v21, 0
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: s_add_i32 s6, s32, 0x100400
; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -671,24 +671,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: s_add_i32 s6, s32, 0x100400
; GFX8-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v21, s30, 0
-; GFX8-NEXT: v_writelane_b32 v21, s31, 1
-; GFX8-NEXT: v_writelane_b32 v21, s33, 2
-; GFX8-NEXT: v_writelane_b32 v21, s34, 3
-; GFX8-NEXT: v_writelane_b32 v21, s35, 4
-; GFX8-NEXT: v_writelane_b32 v21, s36, 5
-; GFX8-NEXT: v_writelane_b32 v21, s37, 6
-; GFX8-NEXT: v_writelane_b32 v21, s38, 7
-; GFX8-NEXT: v_writelane_b32 v21, s39, 8
-; GFX8-NEXT: v_writelane_b32 v21, s48, 9
-; GFX8-NEXT: v_writelane_b32 v21, s49, 10
-; GFX8-NEXT: v_writelane_b32 v21, s50, 11
-; GFX8-NEXT: v_writelane_b32 v21, s51, 12
-; GFX8-NEXT: v_writelane_b32 v21, s52, 13
-; GFX8-NEXT: v_writelane_b32 v21, s53, 14
-; GFX8-NEXT: v_writelane_b32 v21, s54, 15
+; GFX8-NEXT: v_writelane_b32 v21, s33, 0
+; GFX8-NEXT: v_writelane_b32 v21, s34, 1
+; GFX8-NEXT: v_writelane_b32 v21, s35, 2
+; GFX8-NEXT: v_writelane_b32 v21, s36, 3
+; GFX8-NEXT: v_writelane_b32 v21, s37, 4
+; GFX8-NEXT: v_writelane_b32 v21, s38, 5
+; GFX8-NEXT: v_writelane_b32 v21, s39, 6
+; GFX8-NEXT: v_writelane_b32 v21, s48, 7
+; GFX8-NEXT: v_writelane_b32 v21, s49, 8
+; GFX8-NEXT: v_writelane_b32 v21, s50, 9
+; GFX8-NEXT: v_writelane_b32 v21, s51, 10
+; GFX8-NEXT: v_writelane_b32 v21, s52, 11
+; GFX8-NEXT: v_writelane_b32 v21, s53, 12
+; GFX8-NEXT: v_writelane_b32 v21, s54, 13
+; GFX8-NEXT: v_writelane_b32 v21, s55, 14
+; GFX8-NEXT: v_writelane_b32 v21, s30, 15
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v21, s55, 16
+; GFX8-NEXT: v_writelane_b32 v21, s31, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX8-NEXT: ;;#ASMEND
@@ -699,23 +699,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v21, 16
-; GFX8-NEXT: v_readlane_b32 s54, v21, 15
-; GFX8-NEXT: v_readlane_b32 s53, v21, 14
-; GFX8-NEXT: v_readlane_b32 s52, v21, 13
-; GFX8-NEXT: v_readlane_b32 s51, v21, 12
-; GFX8-NEXT: v_readlane_b32 s50, v21, 11
-; GFX8-NEXT: v_readlane_b32 s49, v21, 10
-; GFX8-NEXT: v_readlane_b32 s48, v21, 9
-; GFX8-NEXT: v_readlane_b32 s39, v21, 8
-; GFX8-NEXT: v_readlane_b32 s38, v21, 7
-; GFX8-NEXT: v_readlane_b32 s37, v21, 6
-; GFX8-NEXT: v_readlane_b32 s36, v21, 5
-; GFX8-NEXT: v_readlane_b32 s35, v21, 4
-; GFX8-NEXT: v_readlane_b32 s34, v21, 3
-; GFX8-NEXT: v_readlane_b32 s33, v21, 2
-; GFX8-NEXT: v_readlane_b32 s31, v21, 1
-; GFX8-NEXT: v_readlane_b32 s30, v21, 0
+; GFX8-NEXT: v_readlane_b32 s30, v21, 15
+; GFX8-NEXT: v_readlane_b32 s31, v21, 16
+; GFX8-NEXT: v_readlane_b32 s55, v21, 14
+; GFX8-NEXT: v_readlane_b32 s54, v21, 13
+; GFX8-NEXT: v_readlane_b32 s53, v21, 12
+; GFX8-NEXT: v_readlane_b32 s52, v21, 11
+; GFX8-NEXT: v_readlane_b32 s51, v21, 10
+; GFX8-NEXT: v_readlane_b32 s50, v21, 9
+; GFX8-NEXT: v_readlane_b32 s49, v21, 8
+; GFX8-NEXT: v_readlane_b32 s48, v21, 7
+; GFX8-NEXT: v_readlane_b32 s39, v21, 6
+; GFX8-NEXT: v_readlane_b32 s38, v21, 5
+; GFX8-NEXT: v_readlane_b32 s37, v21, 4
+; GFX8-NEXT: v_readlane_b32 s36, v21, 3
+; GFX8-NEXT: v_readlane_b32 s35, v21, 2
+; GFX8-NEXT: v_readlane_b32 s34, v21, 1
+; GFX8-NEXT: v_readlane_b32 s33, v21, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x100400
; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -730,24 +730,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX900-NEXT: s_add_i32 s6, s32, 0x100400
; GFX900-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v21, s30, 0
-; GFX900-NEXT: v_writelane_b32 v21, s31, 1
-; GFX900-NEXT: v_writelane_b32 v21, s33, 2
-; GFX900-NEXT: v_writelane_b32 v21, s34, 3
-; GFX900-NEXT: v_writelane_b32 v21, s35, 4
-; GFX900-NEXT: v_writelane_b32 v21, s36, 5
-; GFX900-NEXT: v_writelane_b32 v21, s37, 6
-; GFX900-NEXT: v_writelane_b32 v21, s38, 7
-; GFX900-NEXT: v_writelane_b32 v21, s39, 8
-; GFX900-NEXT: v_writelane_b32 v21, s48, 9
-; GFX900-NEXT: v_writelane_b32 v21, s49, 10
-; GFX900-NEXT: v_writelane_b32 v21, s50, 11
-; GFX900-NEXT: v_writelane_b32 v21, s51, 12
-; GFX900-NEXT: v_writelane_b32 v21, s52, 13
-; GFX900-NEXT: v_writelane_b32 v21, s53, 14
-; GFX900-NEXT: v_writelane_b32 v21, s54, 15
+; GFX900-NEXT: v_writelane_b32 v21, s33, 0
+; GFX900-NEXT: v_writelane_b32 v21, s34, 1
+; GFX900-NEXT: v_writelane_b32 v21, s35, 2
+; GFX900-NEXT: v_writelane_b32 v21, s36, 3
+; GFX900-NEXT: v_writelane_b32 v21, s37, 4
+; GFX900-NEXT: v_writelane_b32 v21, s38, 5
+; GFX900-NEXT: v_writelane_b32 v21, s39, 6
+; GFX900-NEXT: v_writelane_b32 v21, s48, 7
+; GFX900-NEXT: v_writelane_b32 v21, s49, 8
+; GFX900-NEXT: v_writelane_b32 v21, s50, 9
+; GFX900-NEXT: v_writelane_b32 v21, s51, 10
+; GFX900-NEXT: v_writelane_b32 v21, s52, 11
+; GFX900-NEXT: v_writelane_b32 v21, s53, 12
+; GFX900-NEXT: v_writelane_b32 v21, s54, 13
+; GFX900-NEXT: v_writelane_b32 v21, s55, 14
+; GFX900-NEXT: v_writelane_b32 v21, s30, 15
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v21, s55, 16
+; GFX900-NEXT: v_writelane_b32 v21, s31, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX900-NEXT: ;;#ASMEND
@@ -758,23 +758,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v21, 16
-; GFX900-NEXT: v_readlane_b32 s54, v21, 15
-; GFX900-NEXT: v_readlane_b32 s53, v21, 14
-; GFX900-NEXT: v_readlane_b32 s52, v21, 13
-; GFX900-NEXT: v_readlane_b32 s51, v21, 12
-; GFX900-NEXT: v_readlane_b32 s50, v21, 11
-; GFX900-NEXT: v_readlane_b32 s49, v21, 10
-; GFX900-NEXT: v_readlane_b32 s48, v21, 9
-; GFX900-NEXT: v_readlane_b32 s39, v21, 8
-; GFX900-NEXT: v_readlane_b32 s38, v21, 7
-; GFX900-NEXT: v_readlane_b32 s37, v21, 6
-; GFX900-NEXT: v_readlane_b32 s36, v21, 5
-; GFX900-NEXT: v_readlane_b32 s35, v21, 4
-; GFX900-NEXT: v_readlane_b32 s34, v21, 3
-; GFX900-NEXT: v_readlane_b32 s33, v21, 2
-; GFX900-NEXT: v_readlane_b32 s31, v21, 1
-; GFX900-NEXT: v_readlane_b32 s30, v21, 0
+; GFX900-NEXT: v_readlane_b32 s30, v21, 15
+; GFX900-NEXT: v_readlane_b32 s31, v21, 16
+; GFX900-NEXT: v_readlane_b32 s55, v21, 14
+; GFX900-NEXT: v_readlane_b32 s54, v21, 13
+; GFX900-NEXT: v_readlane_b32 s53, v21, 12
+; GFX900-NEXT: v_readlane_b32 s52, v21, 11
+; GFX900-NEXT: v_readlane_b32 s51, v21, 10
+; GFX900-NEXT: v_readlane_b32 s50, v21, 9
+; GFX900-NEXT: v_readlane_b32 s49, v21, 8
+; GFX900-NEXT: v_readlane_b32 s48, v21, 7
+; GFX900-NEXT: v_readlane_b32 s39, v21, 6
+; GFX900-NEXT: v_readlane_b32 s38, v21, 5
+; GFX900-NEXT: v_readlane_b32 s37, v21, 4
+; GFX900-NEXT: v_readlane_b32 s36, v21, 3
+; GFX900-NEXT: v_readlane_b32 s35, v21, 2
+; GFX900-NEXT: v_readlane_b32 s34, v21, 1
+; GFX900-NEXT: v_readlane_b32 s33, v21, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: s_add_i32 s6, s32, 0x100400
; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -789,24 +789,25 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX942-NEXT: s_add_i32 s2, s32, 0x4010
; GFX942-NEXT: scratch_store_dword off, v21, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: v_writelane_b32 v21, s30, 0
-; GFX942-NEXT: v_writelane_b32 v21, s31, 1
-; GFX942-NEXT: v_writelane_b32 v21, s33, 2
-; GFX942-NEXT: v_writelane_b32 v21, s34, 3
-; GFX942-NEXT: v_writelane_b32 v21, s35, 4
-; GFX942-NEXT: v_writelane_b32 v21, s36, 5
-; GFX942-NEXT: v_writelane_b32 v21, s37, 6
-; GFX942-NEXT: v_writelane_b32 v21, s38, 7
-; GFX942-NEXT: v_writelane_b32 v21, s39, 8
-; GFX942-NEXT: v_writelane_b32 v21, s48, 9
-; GFX942-NEXT: v_writelane_b32 v21, s49, 10
-; GFX942-NEXT: v_writelane_b32 v21, s50, 11
-; GFX942-NEXT: v_writelane_b32 v21, s51, 12
-; GFX942-NEXT: v_writelane_b32 v21, s52, 13
-; GFX942-NEXT: v_writelane_b32 v21, s53, 14
-; GFX942-NEXT: v_writelane_b32 v21, s54, 15
+; GFX942-NEXT: v_writelane_b32 v21, s33, 0
+; GFX942-NEXT: v_writelane_b32 v21, s34, 1
+; GFX942-NEXT: v_writelane_b32 v21, s35, 2
+; GFX942-NEXT: v_writelane_b32 v21, s36, 3
+; GFX942-NEXT: v_writelane_b32 v21, s37, 4
+; GFX942-NEXT: v_writelane_b32 v21, s38, 5
+; GFX942-NEXT: v_writelane_b32 v21, s39, 6
+; GFX942-NEXT: v_writelane_b32 v21, s48, 7
+; GFX942-NEXT: v_writelane_b32 v21, s49, 8
+; GFX942-NEXT: v_writelane_b32 v21, s50, 9
+; GFX942-NEXT: v_writelane_b32 v21, s51, 10
+; GFX942-NEXT: v_writelane_b32 v21, s52, 11
+; GFX942-NEXT: v_writelane_b32 v21, s53, 12
+; GFX942-NEXT: v_writelane_b32 v21, s54, 13
+; GFX942-NEXT: v_writelane_b32 v21, s55, 14
+; GFX942-NEXT: v_writelane_b32 v21, s30, 15
; GFX942-NEXT: s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT: v_writelane_b32 v21, s55, 16
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_writelane_b32 v21, s31, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX942-NEXT: ;;#ASMEND
@@ -818,23 +819,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v21, 16
-; GFX942-NEXT: v_readlane_b32 s54, v21, 15
-; GFX942-NEXT: v_readlane_b32 s53, v21, 14
-; GFX942-NEXT: v_readlane_b32 s52, v21, 13
-; GFX942-NEXT: v_readlane_b32 s51, v21, 12
-; GFX942-NEXT: v_readlane_b32 s50, v21, 11
-; GFX942-NEXT: v_readlane_b32 s49, v21, 10
-; GFX942-NEXT: v_readlane_b32 s48, v21, 9
-; GFX942-NEXT: v_readlane_b32 s39, v21, 8
-; GFX942-NEXT: v_readlane_b32 s38, v21, 7
-; GFX942-NEXT: v_readlane_b32 s37, v21, 6
-; GFX942-NEXT: v_readlane_b32 s36, v21, 5
-; GFX942-NEXT: v_readlane_b32 s35, v21, 4
-; GFX942-NEXT: v_readlane_b32 s34, v21, 3
-; GFX942-NEXT: v_readlane_b32 s33, v21, 2
-; GFX942-NEXT: v_readlane_b32 s31, v21, 1
-; GFX942-NEXT: v_readlane_b32 s30, v21, 0
+; GFX942-NEXT: v_readlane_b32 s30, v21, 15
+; GFX942-NEXT: v_readlane_b32 s31, v21, 16
+; GFX942-NEXT: v_readlane_b32 s55, v21, 14
+; GFX942-NEXT: v_readlane_b32 s54, v21, 13
+; GFX942-NEXT: v_readlane_b32 s53, v21, 12
+; GFX942-NEXT: v_readlane_b32 s52, v21, 11
+; GFX942-NEXT: v_readlane_b32 s51, v21, 10
+; GFX942-NEXT: v_readlane_b32 s50, v21, 9
+; GFX942-NEXT: v_readlane_b32 s49, v21, 8
+; GFX942-NEXT: v_readlane_b32 s48, v21, 7
+; GFX942-NEXT: v_readlane_b32 s39, v21, 6
+; GFX942-NEXT: v_readlane_b32 s38, v21, 5
+; GFX942-NEXT: v_readlane_b32 s37, v21, 4
+; GFX942-NEXT: v_readlane_b32 s36, v21, 3
+; GFX942-NEXT: v_readlane_b32 s35, v21, 2
+; GFX942-NEXT: v_readlane_b32 s34, v21, 1
+; GFX942-NEXT: v_readlane_b32 s33, v21, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: s_add_i32 s2, s32, 0x4010
; GFX942-NEXT: scratch_load_dword v21, off, s2 ; 4-byte Folded Reload
@@ -850,24 +851,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_1-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v21, s30, 0
+; GFX10_1-NEXT: v_writelane_b32 v21, s33, 0
; GFX10_1-NEXT: s_and_b32 s59, 0, exec_lo
-; GFX10_1-NEXT: v_writelane_b32 v21, s31, 1
-; GFX10_1-NEXT: v_writelane_b32 v21, s33, 2
-; GFX10_1-NEXT: v_writelane_b32 v21, s34, 3
-; GFX10_1-NEXT: v_writelane_b32 v21, s35, 4
-; GFX10_1-NEXT: v_writelane_b32 v21, s36, 5
-; GFX10_1-NEXT: v_writelane_b32 v21, s37, 6
-; GFX10_1-NEXT: v_writelane_b32 v21, s38, 7
-; GFX10_1-NEXT: v_writelane_b32 v21, s39, 8
-; GFX10_1-NEXT: v_writelane_b32 v21, s48, 9
-; GFX10_1-NEXT: v_writelane_b32 v21, s49, 10
-; GFX10_1-NEXT: v_writelane_b32 v21, s50, 11
-; GFX10_1-NEXT: v_writelane_b32 v21, s51, 12
-; GFX10_1-NEXT: v_writelane_b32 v21, s52, 13
-; GFX10_1-NEXT: v_writelane_b32 v21, s53, 14
-; GFX10_1-NEXT: v_writelane_b32 v21, s54, 15
-; GFX10_1-NEXT: v_writelane_b32 v21, s55, 16
+; GFX10_1-NEXT: v_writelane_b32 v21, s34, 1
+; GFX10_1-NEXT: v_writelane_b32 v21, s35, 2
+; GFX10_1-NEXT: v_writelane_b32 v21, s36, 3
+; GFX10_1-NEXT: v_writelane_b32 v21, s37, 4
+; GFX10_1-NEXT: v_writelane_b32 v21, s38, 5
+; GFX10_1-NEXT: v_writelane_b32 v21, s39, 6
+; GFX10_1-NEXT: v_writelane_b32 v21, s48, 7
+; GFX10_1-NEXT: v_writelane_b32 v21, s49, 8
+; GFX10_1-NEXT: v_writelane_b32 v21, s50, 9
+; GFX10_1-NEXT: v_writelane_b32 v21, s51, 10
+; GFX10_1-NEXT: v_writelane_b32 v21, s52, 11
+; GFX10_1-NEXT: v_writelane_b32 v21, s53, 12
+; GFX10_1-NEXT: v_writelane_b32 v21, s54, 13
+; GFX10_1-NEXT: v_writelane_b32 v21, s55, 14
+; GFX10_1-NEXT: v_writelane_b32 v21, s30, 15
+; GFX10_1-NEXT: v_writelane_b32 v21, s31, 16
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX10_1-NEXT: ;;#ASMEND
@@ -878,23 +879,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v21, 16
-; GFX10_1-NEXT: v_readlane_b32 s54, v21, 15
-; GFX10_1-NEXT: v_readlane_b32 s53, v21, 14
-; GFX10_1-NEXT: v_readlane_b32 s52, v21, 13
-; GFX10_1-NEXT: v_readlane_b32 s51, v21, 12
-; GFX10_1-NEXT: v_readlane_b32 s50, v21, 11
-; GFX10_1-NEXT: v_readlane_b32 s49, v21, 10
-; GFX10_1-NEXT: v_readlane_b32 s48, v21, 9
-; GFX10_1-NEXT: v_readlane_b32 s39, v21, 8
-; GFX10_1-NEXT: v_readlane_b32 s38, v21, 7
-; GFX10_1-NEXT: v_readlane_b32 s37, v21, 6
-; GFX10_1-NEXT: v_readlane_b32 s36, v21, 5
-; GFX10_1-NEXT: v_readlane_b32 s35, v21, 4
-; GFX10_1-NEXT: v_readlane_b32 s34, v21, 3
-; GFX10_1-NEXT: v_readlane_b32 s33, v21, 2
-; GFX10_1-NEXT: v_readlane_b32 s31, v21, 1
-; GFX10_1-NEXT: v_readlane_b32 s30, v21, 0
+; GFX10_1-NEXT: v_readlane_b32 s30, v21, 15
+; GFX10_1-NEXT: v_readlane_b32 s31, v21, 16
+; GFX10_1-NEXT: v_readlane_b32 s55, v21, 14
+; GFX10_1-NEXT: v_readlane_b32 s54, v21, 13
+; GFX10_1-NEXT: v_readlane_b32 s53, v21, 12
+; GFX10_1-NEXT: v_readlane_b32 s52, v21, 11
+; GFX10_1-NEXT: v_readlane_b32 s51, v21, 10
+; GFX10_1-NEXT: v_readlane_b32 s50, v21, 9
+; GFX10_1-NEXT: v_readlane_b32 s49, v21, 8
+; GFX10_1-NEXT: v_readlane_b32 s48, v21, 7
+; GFX10_1-NEXT: v_readlane_b32 s39, v21, 6
+; GFX10_1-NEXT: v_readlane_b32 s38, v21, 5
+; GFX10_1-NEXT: v_readlane_b32 s37, v21, 4
+; GFX10_1-NEXT: v_readlane_b32 s36, v21, 3
+; GFX10_1-NEXT: v_readlane_b32 s35, v21, 2
+; GFX10_1-NEXT: v_readlane_b32 s34, v21, 1
+; GFX10_1-NEXT: v_readlane_b32 s33, v21, 0
; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80200
; GFX10_1-NEXT: buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -910,24 +911,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80200
; GFX10_3-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v21, s30, 0
+; GFX10_3-NEXT: v_writelane_b32 v21, s33, 0
; GFX10_3-NEXT: s_and_b32 s59, 0, exec_lo
-; GFX10_3-NEXT: v_writelane_b32 v21, s31, 1
-; GFX10_3-NEXT: v_writelane_b32 v21, s33, 2
-; GFX10_3-NEXT: v_writelane_b32 v21, s34, 3
-; GFX10_3-NEXT: v_writelane_b32 v21, s35, 4
-; GFX10_3-NEXT: v_writelane_b32 v21, s36, 5
-; GFX10_3-NEXT: v_writelane_b32 v21, s37, 6
-; GFX10_3-NEXT: v_writelane_b32 v21, s38, 7
-; GFX10_3-NEXT: v_writelane_b32 v21, s39, 8
-; GFX10_3-NEXT: v_writelane_b32 v21, s48, 9
-; GFX10_3-NEXT: v_writelane_b32 v21, s49, 10
-; GFX10_3-NEXT: v_writelane_b32 v21, s50, 11
-; GFX10_3-NEXT: v_writelane_b32 v21, s51, 12
-; GFX10_3-NEXT: v_writelane_b32 v21, s52, 13
-; GFX10_3-NEXT: v_writelane_b32 v21, s53, 14
-; GFX10_3-NEXT: v_writelane_b32 v21, s54, 15
-; GFX10_3-NEXT: v_writelane_b32 v21, s55, 16
+; GFX10_3-NEXT: v_writelane_b32 v21, s34, 1
+; GFX10_3-NEXT: v_writelane_b32 v21, s35, 2
+; GFX10_3-NEXT: v_writelane_b32 v21, s36, 3
+; GFX10_3-NEXT: v_writelane_b32 v21, s37, 4
+; GFX10_3-NEXT: v_writelane_b32 v21, s38, 5
+; GFX10_3-NEXT: v_writelane_b32 v21, s39, 6
+; GFX10_3-NEXT: v_writelane_b32 v21, s48, 7
+; GFX10_3-NEXT: v_writelane_b32 v21, s49, 8
+; GFX10_3-NEXT: v_writelane_b32 v21, s50, 9
+; GFX10_3-NEXT: v_writelane_b32 v21, s51, 10
+; GFX10_3-NEXT: v_writelane_b32 v21, s52, 11
+; GFX10_3-NEXT: v_writelane_b32 v21, s53, 12
+; GFX10_3-NEXT: v_writelane_b32 v21, s54, 13
+; GFX10_3-NEXT: v_writelane_b32 v21, s55, 14
+; GFX10_3-NEXT: v_writelane_b32 v21, s30, 15
+; GFX10_3-NEXT: v_writelane_b32 v21, s31, 16
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX10_3-NEXT: ;;#ASMEND
@@ -938,23 +939,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v21, 16
-; GFX10_3-NEXT: v_readlane_b32 s54, v21, 15
-; GFX10_3-NEXT: v_readlane_b32 s53, v21, 14
-; GFX10_3-NEXT: v_readlane_b32 s52, v21, 13
-; GFX10_3-NEXT: v_readlane_b32 s51, v21, 12
-; GFX10_3-NEXT: v_readlane_b32 s50, v21, 11
-; GFX10_3-NEXT: v_readlane_b32 s49, v21, 10
-; GFX10_3-NEXT: v_readlane_b32 s48, v21, 9
-; GFX10_3-NEXT: v_readlane_b32 s39, v21, 8
-; GFX10_3-NEXT: v_readlane_b32 s38, v21, 7
-; GFX10_3-NEXT: v_readlane_b32 s37, v21, 6
-; GFX10_3-NEXT: v_readlane_b32 s36, v21, 5
-; GFX10_3-NEXT: v_readlane_b32 s35, v21, 4
-; GFX10_3-NEXT: v_readlane_b32 s34, v21, 3
-; GFX10_3-NEXT: v_readlane_b32 s33, v21, 2
-; GFX10_3-NEXT: v_readlane_b32 s31, v21, 1
-; GFX10_3-NEXT: v_readlane_b32 s30, v21, 0
+; GFX10_3-NEXT: v_readlane_b32 s30, v21, 15
+; GFX10_3-NEXT: v_readlane_b32 s31, v21, 16
+; GFX10_3-NEXT: v_readlane_b32 s55, v21, 14
+; GFX10_3-NEXT: v_readlane_b32 s54, v21, 13
+; GFX10_3-NEXT: v_readlane_b32 s53, v21, 12
+; GFX10_3-NEXT: v_readlane_b32 s52, v21, 11
+; GFX10_3-NEXT: v_readlane_b32 s51, v21, 10
+; GFX10_3-NEXT: v_readlane_b32 s50, v21, 9
+; GFX10_3-NEXT: v_readlane_b32 s49, v21, 8
+; GFX10_3-NEXT: v_readlane_b32 s48, v21, 7
+; GFX10_3-NEXT: v_readlane_b32 s39, v21, 6
+; GFX10_3-NEXT: v_readlane_b32 s38, v21, 5
+; GFX10_3-NEXT: v_readlane_b32 s37, v21, 4
+; GFX10_3-NEXT: v_readlane_b32 s36, v21, 3
+; GFX10_3-NEXT: v_readlane_b32 s35, v21, 2
+; GFX10_3-NEXT: v_readlane_b32 s34, v21, 1
+; GFX10_3-NEXT: v_readlane_b32 s33, v21, 0
; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80200
; GFX10_3-NEXT: buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -969,24 +970,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX11-NEXT: s_add_i32 s1, s32, 0x4010
; GFX11-NEXT: scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v21, s30, 0
+; GFX11-NEXT: v_writelane_b32 v21, s33, 0
; GFX11-NEXT: s_and_b32 s59, 0, exec_lo
-; GFX11-NEXT: v_writelane_b32 v21, s31, 1
-; GFX11-NEXT: v_writelane_b32 v21, s33, 2
-; GFX11-NEXT: v_writelane_b32 v21, s34, 3
-; GFX11-NEXT: v_writelane_b32 v21, s35, 4
-; GFX11-NEXT: v_writelane_b32 v21, s36, 5
-; GFX11-NEXT: v_writelane_b32 v21, s37, 6
-; GFX11-NEXT: v_writelane_b32 v21, s38, 7
-; GFX11-NEXT: v_writelane_b32 v21, s39, 8
-; GFX11-NEXT: v_writelane_b32 v21, s48, 9
-; GFX11-NEXT: v_writelane_b32 v21, s49, 10
-; GFX11-NEXT: v_writelane_b32 v21, s50, 11
-; GFX11-NEXT: v_writelane_b32 v21, s51, 12
-; GFX11-NEXT: v_writelane_b32 v21, s52, 13
-; GFX11-NEXT: v_writelane_b32 v21, s53, 14
-; GFX11-NEXT: v_writelane_b32 v21, s54, 15
-; GFX11-NEXT: v_writelane_b32 v21, s55, 16
+; GFX11-NEXT: v_writelane_b32 v21, s34, 1
+; GFX11-NEXT: v_writelane_b32 v21, s35, 2
+; GFX11-NEXT: v_writelane_b32 v21, s36, 3
+; GFX11-NEXT: v_writelane_b32 v21, s37, 4
+; GFX11-NEXT: v_writelane_b32 v21, s38, 5
+; GFX11-NEXT: v_writelane_b32 v21, s39, 6
+; GFX11-NEXT: v_writelane_b32 v21, s48, 7
+; GFX11-NEXT: v_writelane_b32 v21, s49, 8
+; GFX11-NEXT: v_writelane_b32 v21, s50, 9
+; GFX11-NEXT: v_writelane_b32 v21, s51, 10
+; GFX11-NEXT: v_writelane_b32 v21, s52, 11
+; GFX11-NEXT: v_writelane_b32 v21, s53, 12
+; GFX11-NEXT: v_writelane_b32 v21, s54, 13
+; GFX11-NEXT: v_writelane_b32 v21, s55, 14
+; GFX11-NEXT: v_writelane_b32 v21, s30, 15
+; GFX11-NEXT: v_writelane_b32 v21, s31, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX11-NEXT: ;;#ASMEND
@@ -999,23 +1000,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v21, 16
-; GFX11-NEXT: v_readlane_b32 s54, v21, 15
-; GFX11-NEXT: v_readlane_b32 s53, v21, 14
-; GFX11-NEXT: v_readlane_b32 s52, v21, 13
-; GFX11-NEXT: v_readlane_b32 s51, v21, 12
-; GFX11-NEXT: v_readlane_b32 s50, v21, 11
-; GFX11-NEXT: v_readlane_b32 s49, v21, 10
-; GFX11-NEXT: v_readlane_b32 s48, v21, 9
-; GFX11-NEXT: v_readlane_b32 s39, v21, 8
-; GFX11-NEXT: v_readlane_b32 s38, v21, 7
-; GFX11-NEXT: v_readlane_b32 s37, v21, 6
-; GFX11-NEXT: v_readlane_b32 s36, v21, 5
-; GFX11-NEXT: v_readlane_b32 s35, v21, 4
-; GFX11-NEXT: v_readlane_b32 s34, v21, 3
-; GFX11-NEXT: v_readlane_b32 s33, v21, 2
-; GFX11-NEXT: v_readlane_b32 s31, v21, 1
-; GFX11-NEXT: v_readlane_b32 s30, v21, 0
+; GFX11-NEXT: v_readlane_b32 s30, v21, 15
+; GFX11-NEXT: v_readlane_b32 s31, v21, 16
+; GFX11-NEXT: v_readlane_b32 s55, v21, 14
+; GFX11-NEXT: v_readlane_b32 s54, v21, 13
+; GFX11-NEXT: v_readlane_b32 s53, v21, 12
+; GFX11-NEXT: v_readlane_b32 s52, v21, 11
+; GFX11-NEXT: v_readlane_b32 s51, v21, 10
+; GFX11-NEXT: v_readlane_b32 s50, v21, 9
+; GFX11-NEXT: v_readlane_b32 s49, v21, 8
+; GFX11-NEXT: v_readlane_b32 s48, v21, 7
+; GFX11-NEXT: v_readlane_b32 s39, v21, 6
+; GFX11-NEXT: v_readlane_b32 s38, v21, 5
+; GFX11-NEXT: v_readlane_b32 s37, v21, 4
+; GFX11-NEXT: v_readlane_b32 s36, v21, 3
+; GFX11-NEXT: v_readlane_b32 s35, v21, 2
+; GFX11-NEXT: v_readlane_b32 s34, v21, 1
+; GFX11-NEXT: v_readlane_b32 s33, v21, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: s_add_i32 s1, s32, 0x4010
; GFX11-NEXT: scratch_load_b32 v21, off, s1 ; 4-byte Folded Reload
@@ -1034,24 +1035,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v21, s30, 0
+; GFX12-NEXT: v_writelane_b32 v21, s33, 0
; GFX12-NEXT: s_and_b32 s59, 0, exec_lo
-; GFX12-NEXT: v_writelane_b32 v21, s31, 1
-; GFX12-NEXT: v_writelane_b32 v21, s33, 2
-; GFX12-NEXT: v_writelane_b32 v21, s34, 3
-; GFX12-NEXT: v_writelane_b32 v21, s35, 4
-; GFX12-NEXT: v_writelane_b32 v21, s36, 5
-; GFX12-NEXT: v_writelane_b32 v21, s37, 6
-; GFX12-NEXT: v_writelane_b32 v21, s38, 7
-; GFX12-NEXT: v_writelane_b32 v21, s39, 8
-; GFX12-NEXT: v_writelane_b32 v21, s48, 9
-; GFX12-NEXT: v_writelane_b32 v21, s49, 10
-; GFX12-NEXT: v_writelane_b32 v21, s50, 11
-; GFX12-NEXT: v_writelane_b32 v21, s51, 12
-; GFX12-NEXT: v_writelane_b32 v21, s52, 13
-; GFX12-NEXT: v_writelane_b32 v21, s53, 14
-; GFX12-NEXT: v_writelane_b32 v21, s54, 15
-; GFX12-NEXT: v_writelane_b32 v21, s55, 16
+; GFX12-NEXT: v_writelane_b32 v21, s34, 1
+; GFX12-NEXT: v_writelane_b32 v21, s35, 2
+; GFX12-NEXT: v_writelane_b32 v21, s36, 3
+; GFX12-NEXT: v_writelane_b32 v21, s37, 4
+; GFX12-NEXT: v_writelane_b32 v21, s38, 5
+; GFX12-NEXT: v_writelane_b32 v21, s39, 6
+; GFX12-NEXT: v_writelane_b32 v21, s48, 7
+; GFX12-NEXT: v_writelane_b32 v21, s49, 8
+; GFX12-NEXT: v_writelane_b32 v21, s50, 9
+; GFX12-NEXT: v_writelane_b32 v21, s51, 10
+; GFX12-NEXT: v_writelane_b32 v21, s52, 11
+; GFX12-NEXT: v_writelane_b32 v21, s53, 12
+; GFX12-NEXT: v_writelane_b32 v21, s54, 13
+; GFX12-NEXT: v_writelane_b32 v21, s55, 14
+; GFX12-NEXT: v_writelane_b32 v21, s30, 15
+; GFX12-NEXT: v_writelane_b32 v21, s31, 16
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX12-NEXT: ;;#ASMEND
@@ -1061,23 +1062,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readlane_b32 s55, v21, 16
-; GFX12-NEXT: v_readlane_b32 s54, v21, 15
-; GFX12-NEXT: v_readlane_b32 s53, v21, 14
-; GFX12-NEXT: v_readlane_b32 s52, v21, 13
-; GFX12-NEXT: v_readlane_b32 s51, v21, 12
-; GFX12-NEXT: v_readlane_b32 s50, v21, 11
-; GFX12-NEXT: v_readlane_b32 s49, v21, 10
-; GFX12-NEXT: v_readlane_b32 s48, v21, 9
-; GFX12-NEXT: v_readlane_b32 s39, v21, 8
-; GFX12-NEXT: v_readlane_b32 s38, v21, 7
-; GFX12-NEXT: v_readlane_b32 s37, v21, 6
-; GFX12-NEXT: v_readlane_b32 s36, v21, 5
-; GFX12-NEXT: v_readlane_b32 s35, v21, 4
-; GFX12-NEXT: v_readlane_b32 s34, v21, 3
-; GFX12-NEXT: v_readlane_b32 s33, v21, 2
-; GFX12-NEXT: v_readlane_b32 s31, v21, 1
-; GFX12-NEXT: v_readlane_b32 s30, v21, 0
+; GFX12-NEXT: v_readlane_b32 s30, v21, 15
+; GFX12-NEXT: v_readlane_b32 s31, v21, 16
+; GFX12-NEXT: v_readlane_b32 s55, v21, 14
+; GFX12-NEXT: v_readlane_b32 s54, v21, 13
+; GFX12-NEXT: v_readlane_b32 s53, v21, 12
+; GFX12-NEXT: v_readlane_b32 s52, v21, 11
+; GFX12-NEXT: v_readlane_b32 s51, v21, 10
+; GFX12-NEXT: v_readlane_b32 s50, v21, 9
+; GFX12-NEXT: v_readlane_b32 s49, v21, 8
+; GFX12-NEXT: v_readlane_b32 s48, v21, 7
+; GFX12-NEXT: v_readlane_b32 s39, v21, 6
+; GFX12-NEXT: v_readlane_b32 s38, v21, 5
+; GFX12-NEXT: v_readlane_b32 s37, v21, 4
+; GFX12-NEXT: v_readlane_b32 s36, v21, 3
+; GFX12-NEXT: v_readlane_b32 s35, v21, 2
+; GFX12-NEXT: v_readlane_b32 s34, v21, 1
+; GFX12-NEXT: v_readlane_b32 s33, v21, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -1135,30 +1136,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: v_writelane_b32 v23, s28, 17
; GFX7-NEXT: v_writelane_b32 v23, s29, 18
-; GFX7-NEXT: v_writelane_b32 v23, s30, 0
-; GFX7-NEXT: v_writelane_b32 v23, s31, 1
-; GFX7-NEXT: v_writelane_b32 v23, s33, 2
-; GFX7-NEXT: v_writelane_b32 v23, s34, 3
-; GFX7-NEXT: v_writelane_b32 v23, s35, 4
-; GFX7-NEXT: v_writelane_b32 v23, s36, 5
-; GFX7-NEXT: v_writelane_b32 v23, s37, 6
-; GFX7-NEXT: v_writelane_b32 v23, s38, 7
-; GFX7-NEXT: v_writelane_b32 v23, s39, 8
-; GFX7-NEXT: v_writelane_b32 v23, s48, 9
-; GFX7-NEXT: v_writelane_b32 v23, s49, 10
-; GFX7-NEXT: v_writelane_b32 v23, s50, 11
-; GFX7-NEXT: v_writelane_b32 v23, s51, 12
-; GFX7-NEXT: v_writelane_b32 v23, s52, 13
+; GFX7-NEXT: v_writelane_b32 v23, s33, 0
+; GFX7-NEXT: v_writelane_b32 v23, s34, 1
+; GFX7-NEXT: v_writelane_b32 v23, s35, 2
+; GFX7-NEXT: v_writelane_b32 v23, s36, 3
+; GFX7-NEXT: v_writelane_b32 v23, s37, 4
+; GFX7-NEXT: v_writelane_b32 v23, s38, 5
+; GFX7-NEXT: v_writelane_b32 v23, s39, 6
+; GFX7-NEXT: v_writelane_b32 v23, s48, 7
+; GFX7-NEXT: v_writelane_b32 v23, s49, 8
+; GFX7-NEXT: v_writelane_b32 v23, s50, 9
+; GFX7-NEXT: v_writelane_b32 v23, s51, 10
+; GFX7-NEXT: v_writelane_b32 v23, s52, 11
+; GFX7-NEXT: v_writelane_b32 v23, s53, 12
+; GFX7-NEXT: v_writelane_b32 v23, s54, 13
; GFX7-NEXT: s_lshr_b32 s5, s32, 6
-; GFX7-NEXT: v_writelane_b32 v23, s53, 14
+; GFX7-NEXT: v_writelane_b32 v23, s55, 14
; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
; GFX7-NEXT: s_add_i32 s4, s5, 0x4240
; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT: v_writelane_b32 v23, s54, 15
+; GFX7-NEXT: v_writelane_b32 v23, s30, 15
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0
; GFX7-NEXT: v_writelane_b32 v22, s4, 0
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_writelane_b32 v23, s55, 16
+; GFX7-NEXT: v_writelane_b32 v23, s31, 16
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use alloca0 v0
; GFX7-NEXT: ;;#ASMEND
@@ -1169,23 +1170,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_readlane_b32 s55, v23, 16
-; GFX7-NEXT: v_readlane_b32 s54, v23, 15
-; GFX7-NEXT: v_readlane_b32 s53, v23, 14
-; GFX7-NEXT: v_readlane_b32 s52, v23, 13
-; GFX7-NEXT: v_readlane_b32 s51, v23, 12
-; GFX7-NEXT: v_readlane_b32 s50, v23, 11
-; GFX7-NEXT: v_readlane_b32 s49, v23, 10
-; GFX7-NEXT: v_readlane_b32 s48, v23, 9
-; GFX7-NEXT: v_readlane_b32 s39, v23, 8
-; GFX7-NEXT: v_readlane_b32 s38, v23, 7
-; GFX7-NEXT: v_readlane_b32 s37, v23, 6
-; GFX7-NEXT: v_readlane_b32 s36, v23, 5
-; GFX7-NEXT: v_readlane_b32 s35, v23, 4
-; GFX7-NEXT: v_readlane_b32 s34, v23, 3
-; GFX7-NEXT: v_readlane_b32 s33, v23, 2
-; GFX7-NEXT: v_readlane_b32 s31, v23, 1
-; GFX7-NEXT: v_readlane_b32 s30, v23, 0
+; GFX7-NEXT: v_readlane_b32 s30, v23, 15
+; GFX7-NEXT: v_readlane_b32 s31, v23, 16
+; GFX7-NEXT: v_readlane_b32 s55, v23, 14
+; GFX7-NEXT: v_readlane_b32 s54, v23, 13
+; GFX7-NEXT: v_readlane_b32 s53, v23, 12
+; GFX7-NEXT: v_readlane_b32 s52, v23, 11
+; GFX7-NEXT: v_readlane_b32 s51, v23, 10
+; GFX7-NEXT: v_readlane_b32 s50, v23, 9
+; GFX7-NEXT: v_readlane_b32 s49, v23, 8
+; GFX7-NEXT: v_readlane_b32 s48, v23, 7
+; GFX7-NEXT: v_readlane_b32 s39, v23, 6
+; GFX7-NEXT: v_readlane_b32 s38, v23, 5
+; GFX7-NEXT: v_readlane_b32 s37, v23, 4
+; GFX7-NEXT: v_readlane_b32 s36, v23, 3
+; GFX7-NEXT: v_readlane_b32 s35, v23, 2
+; GFX7-NEXT: v_readlane_b32 s34, v23, 1
+; GFX7-NEXT: v_readlane_b32 s33, v23, 0
; GFX7-NEXT: v_readlane_b32 s28, v23, 17
; GFX7-NEXT: v_readlane_b32 s29, v23, 18
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
@@ -1206,30 +1207,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: s_add_i32 s6, s32, 0x201100
; GFX8-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v23, s30, 0
-; GFX8-NEXT: v_writelane_b32 v23, s31, 1
-; GFX8-NEXT: v_writelane_b32 v23, s33, 2
-; GFX8-NEXT: v_writelane_b32 v23, s34, 3
-; GFX8-NEXT: v_writelane_b32 v23, s35, 4
-; GFX8-NEXT: v_writelane_b32 v23, s36, 5
-; GFX8-NEXT: v_writelane_b32 v23, s37, 6
-; GFX8-NEXT: v_writelane_b32 v23, s38, 7
-; GFX8-NEXT: v_writelane_b32 v23, s39, 8
-; GFX8-NEXT: v_writelane_b32 v23, s48, 9
-; GFX8-NEXT: v_writelane_b32 v23, s49, 10
-; GFX8-NEXT: v_writelane_b32 v23, s50, 11
-; GFX8-NEXT: v_writelane_b32 v23, s51, 12
-; GFX8-NEXT: v_writelane_b32 v23, s52, 13
+; GFX8-NEXT: v_writelane_b32 v23, s33, 0
+; GFX8-NEXT: v_writelane_b32 v23, s34, 1
+; GFX8-NEXT: v_writelane_b32 v23, s35, 2
+; GFX8-NEXT: v_writelane_b32 v23, s36, 3
+; GFX8-NEXT: v_writelane_b32 v23, s37, 4
+; GFX8-NEXT: v_writelane_b32 v23, s38, 5
+; GFX8-NEXT: v_writelane_b32 v23, s39, 6
+; GFX8-NEXT: v_writelane_b32 v23, s48, 7
+; GFX8-NEXT: v_writelane_b32 v23, s49, 8
+; GFX8-NEXT: v_writelane_b32 v23, s50, 9
+; GFX8-NEXT: v_writelane_b32 v23, s51, 10
+; GFX8-NEXT: v_writelane_b32 v23, s52, 11
+; GFX8-NEXT: v_writelane_b32 v23, s53, 12
+; GFX8-NEXT: v_writelane_b32 v23, s54, 13
; GFX8-NEXT: s_lshr_b32 s5, s32, 6
-; GFX8-NEXT: v_writelane_b32 v23, s53, 14
+; GFX8-NEXT: v_writelane_b32 v23, s55, 14
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: s_add_i32 s4, s5, 0x4240
; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT: v_writelane_b32 v23, s54, 15
+; GFX8-NEXT: v_writelane_b32 v23, s30, 15
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: v_writelane_b32 v22, s4, 0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v23, s55, 16
+; GFX8-NEXT: v_writelane_b32 v23, s31, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
@@ -1241,23 +1242,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v23, 16
-; GFX8-NEXT: v_readlane_b32 s54, v23, 15
-; GFX8-NEXT: v_readlane_b32 s53, v23, 14
-; GFX8-NEXT: v_readlane_b32 s52, v23, 13
-; GFX8-NEXT: v_readlane_b32 s51, v23, 12
-; GFX8-NEXT: v_readlane_b32 s50, v23, 11
-; GFX8-NEXT: v_readlane_b32 s49, v23, 10
-; GFX8-NEXT: v_readlane_b32 s48, v23, 9
-; GFX8-NEXT: v_readlane_b32 s39, v23, 8
-; GFX8-NEXT: v_readlane_b32 s38, v23, 7
-; GFX8-NEXT: v_readlane_b32 s37, v23, 6
-; GFX8-NEXT: v_readlane_b32 s36, v23, 5
-; GFX8-NEXT: v_readlane_b32 s35, v23, 4
-; GFX8-NEXT: v_readlane_b32 s34, v23, 3
-; GFX8-NEXT: v_readlane_b32 s33, v23, 2
-; GFX8-NEXT: v_readlane_b32 s31, v23, 1
-; GFX8-NEXT: v_readlane_b32 s30, v23, 0
+; GFX8-NEXT: v_readlane_b32 s30, v23, 15
+; GFX8-NEXT: v_readlane_b32 s31, v23, 16
+; GFX8-NEXT: v_readlane_b32 s55, v23, 14
+; GFX8-NEXT: v_readlane_b32 s54, v23, 13
+; GFX8-NEXT: v_readlane_b32 s53, v23, 12
+; GFX8-NEXT: v_readlane_b32 s52, v23, 11
+; GFX8-NEXT: v_readlane_b32 s51, v23, 10
+; GFX8-NEXT: v_readlane_b32 s50, v23, 9
+; GFX8-NEXT: v_readlane_b32 s49, v23, 8
+; GFX8-NEXT: v_readlane_b32 s48, v23, 7
+; GFX8-NEXT: v_readlane_b32 s39, v23, 6
+; GFX8-NEXT: v_readlane_b32 s38, v23, 5
+; GFX8-NEXT: v_readlane_b32 s37, v23, 4
+; GFX8-NEXT: v_readlane_b32 s36, v23, 3
+; GFX8-NEXT: v_readlane_b32 s35, v23, 2
+; GFX8-NEXT: v_readlane_b32 s34, v23, 1
+; GFX8-NEXT: v_readlane_b32 s33, v23, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1276,30 +1277,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX900-NEXT: s_add_i32 s6, s32, 0x201100
; GFX900-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v23, s30, 0
-; GFX900-NEXT: v_writelane_b32 v23, s31, 1
-; GFX900-NEXT: v_writelane_b32 v23, s33, 2
-; GFX900-NEXT: v_writelane_b32 v23, s34, 3
-; GFX900-NEXT: v_writelane_b32 v23, s35, 4
-; GFX900-NEXT: v_writelane_b32 v23, s36, 5
-; GFX900-NEXT: v_writelane_b32 v23, s37, 6
-; GFX900-NEXT: v_writelane_b32 v23, s38, 7
-; GFX900-NEXT: v_writelane_b32 v23, s39, 8
-; GFX900-NEXT: v_writelane_b32 v23, s48, 9
-; GFX900-NEXT: v_writelane_b32 v23, s49, 10
-; GFX900-NEXT: v_writelane_b32 v23, s50, 11
-; GFX900-NEXT: v_writelane_b32 v23, s51, 12
-; GFX900-NEXT: v_writelane_b32 v23, s52, 13
+; GFX900-NEXT: v_writelane_b32 v23, s33, 0
+; GFX900-NEXT: v_writelane_b32 v23, s34, 1
+; GFX900-NEXT: v_writelane_b32 v23, s35, 2
+; GFX900-NEXT: v_writelane_b32 v23, s36, 3
+; GFX900-NEXT: v_writelane_b32 v23, s37, 4
+; GFX900-NEXT: v_writelane_b32 v23, s38, 5
+; GFX900-NEXT: v_writelane_b32 v23, s39, 6
+; GFX900-NEXT: v_writelane_b32 v23, s48, 7
+; GFX900-NEXT: v_writelane_b32 v23, s49, 8
+; GFX900-NEXT: v_writelane_b32 v23, s50, 9
+; GFX900-NEXT: v_writelane_b32 v23, s51, 10
+; GFX900-NEXT: v_writelane_b32 v23, s52, 11
+; GFX900-NEXT: v_writelane_b32 v23, s53, 12
+; GFX900-NEXT: v_writelane_b32 v23, s54, 13
; GFX900-NEXT: s_lshr_b32 s5, s32, 6
-; GFX900-NEXT: v_writelane_b32 v23, s53, 14
+; GFX900-NEXT: v_writelane_b32 v23, s55, 14
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: s_add_i32 s4, s5, 0x4240
; GFX900-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX900-NEXT: v_writelane_b32 v23, s54, 15
+; GFX900-NEXT: v_writelane_b32 v23, s30, 15
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: v_writelane_b32 v22, s4, 0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v23, s55, 16
+; GFX900-NEXT: v_writelane_b32 v23, s31, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
@@ -1311,23 +1312,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v23, 16
-; GFX900-NEXT: v_readlane_b32 s54, v23, 15
-; GFX900-NEXT: v_readlane_b32 s53, v23, 14
-; GFX900-NEXT: v_readlane_b32 s52, v23, 13
-; GFX900-NEXT: v_readlane_b32 s51, v23, 12
-; GFX900-NEXT: v_readlane_b32 s50, v23, 11
-; GFX900-NEXT: v_readlane_b32 s49, v23, 10
-; GFX900-NEXT: v_readlane_b32 s48, v23, 9
-; GFX900-NEXT: v_readlane_b32 s39, v23, 8
-; GFX900-NEXT: v_readlane_b32 s38, v23, 7
-; GFX900-NEXT: v_readlane_b32 s37, v23, 6
-; GFX900-NEXT: v_readlane_b32 s36, v23, 5
-; GFX900-NEXT: v_readlane_b32 s35, v23, 4
-; GFX900-NEXT: v_readlane_b32 s34, v23, 3
-; GFX900-NEXT: v_readlane_b32 s33, v23, 2
-; GFX900-NEXT: v_readlane_b32 s31, v23, 1
-; GFX900-NEXT: v_readlane_b32 s30, v23, 0
+; GFX900-NEXT: v_readlane_b32 s30, v23, 15
+; GFX900-NEXT: v_readlane_b32 s31, v23, 16
+; GFX900-NEXT: v_readlane_b32 s55, v23, 14
+; GFX900-NEXT: v_readlane_b32 s54, v23, 13
+; GFX900-NEXT: v_readlane_b32 s53, v23, 12
+; GFX900-NEXT: v_readlane_b32 s52, v23, 11
+; GFX900-NEXT: v_readlane_b32 s51, v23, 10
+; GFX900-NEXT: v_readlane_b32 s50, v23, 9
+; GFX900-NEXT: v_readlane_b32 s49, v23, 8
+; GFX900-NEXT: v_readlane_b32 s48, v23, 7
+; GFX900-NEXT: v_readlane_b32 s39, v23, 6
+; GFX900-NEXT: v_readlane_b32 s38, v23, 5
+; GFX900-NEXT: v_readlane_b32 s37, v23, 4
+; GFX900-NEXT: v_readlane_b32 s36, v23, 3
+; GFX900-NEXT: v_readlane_b32 s35, v23, 2
+; GFX900-NEXT: v_readlane_b32 s34, v23, 1
+; GFX900-NEXT: v_readlane_b32 s33, v23, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
@@ -1344,28 +1345,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX942-NEXT: s_add_i32 s2, s32, 0x8040
; GFX942-NEXT: scratch_store_dword off, v22, s2 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: v_writelane_b32 v22, s30, 0
-; GFX942-NEXT: v_writelane_b32 v22, s31, 1
-; GFX942-NEXT: v_writelane_b32 v22, s33, 2
-; GFX942-NEXT: v_writelane_b32 v22, s34, 3
-; GFX942-NEXT: v_writelane_b32 v22, s35, 4
-; GFX942-NEXT: v_writelane_b32 v22, s36, 5
-; GFX942-NEXT: v_writelane_b32 v22, s37, 6
-; GFX942-NEXT: v_writelane_b32 v22, s38, 7
-; GFX942-NEXT: v_writelane_b32 v22, s39, 8
-; GFX942-NEXT: v_writelane_b32 v22, s48, 9
-; GFX942-NEXT: v_writelane_b32 v22, s49, 10
-; GFX942-NEXT: v_writelane_b32 v22, s50, 11
-; GFX942-NEXT: v_writelane_b32 v22, s51, 12
-; GFX942-NEXT: v_writelane_b32 v22, s52, 13
-; GFX942-NEXT: v_writelane_b32 v22, s53, 14
+; GFX942-NEXT: v_writelane_b32 v22, s33, 0
+; GFX942-NEXT: v_writelane_b32 v22, s34, 1
+; GFX942-NEXT: v_writelane_b32 v22, s35, 2
+; GFX942-NEXT: v_writelane_b32 v22, s36, 3
+; GFX942-NEXT: v_writelane_b32 v22, s37, 4
+; GFX942-NEXT: v_writelane_b32 v22, s38, 5
+; GFX942-NEXT: v_writelane_b32 v22, s39, 6
+; GFX942-NEXT: v_writelane_b32 v22, s48, 7
+; GFX942-NEXT: v_writelane_b32 v22, s49, 8
+; GFX942-NEXT: v_writelane_b32 v22, s50, 9
+; GFX942-NEXT: v_writelane_b32 v22, s51, 10
+; GFX942-NEXT: v_writelane_b32 v22, s52, 11
+; GFX942-NEXT: v_writelane_b32 v22, s53, 12
+; GFX942-NEXT: v_writelane_b32 v22, s54, 13
+; GFX942-NEXT: v_writelane_b32 v22, s55, 14
; GFX942-NEXT: s_add_i32 s0, s32, 64
-; GFX942-NEXT: v_writelane_b32 v22, s54, 15
+; GFX942-NEXT: v_writelane_b32 v22, s30, 15
; GFX942-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-NEXT: v_writelane_b32 v22, s55, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_writelane_b32 v22, s31, 16
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX942-NEXT: ;;#ASMEND
@@ -1376,23 +1378,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v22, 16
-; GFX942-NEXT: v_readlane_b32 s54, v22, 15
-; GFX942-NEXT: v_readlane_b32 s53, v22, 14
-; GFX942-NEXT: v_readlane_b32 s52, v22, 13
-; GFX942-NEXT: v_readlane_b32 s51, v22, 12
-; GFX942-NEXT: v_readlane_b32 s50, v22, 11
-; GFX942-NEXT: v_readlane_b32 s49, v22, 10
-; GFX942-NEXT: v_readlane_b32 s48, v22, 9
-; GFX942-NEXT: v_readlane_b32 s39, v22, 8
-; GFX942-NEXT: v_readlane_b32 s38, v22, 7
-; GFX942-NEXT: v_readlane_b32 s37, v22, 6
-; GFX942-NEXT: v_readlane_b32 s36, v22, 5
-; GFX942-NEXT: v_readlane_b32 s35, v22, 4
-; GFX942-NEXT: v_readlane_b32 s34, v22, 3
-; GFX942-NEXT: v_readlane_b32 s33, v22, 2
-; GFX942-NEXT: v_readlane_b32 s31, v22, 1
-; GFX942-NEXT: v_readlane_b32 s30, v22, 0
+; GFX942-NEXT: v_readlane_b32 s30, v22, 15
+; GFX942-NEXT: v_readlane_b32 s31, v22, 16
+; GFX942-NEXT: v_readlane_b32 s55, v22, 14
+; GFX942-NEXT: v_readlane_b32 s54, v22, 13
+; GFX942-NEXT: v_readlane_b32 s53, v22, 12
+; GFX942-NEXT: v_readlane_b32 s52, v22, 11
+; GFX942-NEXT: v_readlane_b32 s51, v22, 10
+; GFX942-NEXT: v_readlane_b32 s50, v22, 9
+; GFX942-NEXT: v_readlane_b32 s49, v22, 8
+; GFX942-NEXT: v_readlane_b32 s48, v22, 7
+; GFX942-NEXT: v_readlane_b32 s39, v22, 6
+; GFX942-NEXT: v_readlane_b32 s38, v22, 5
+; GFX942-NEXT: v_readlane_b32 s37, v22, 4
+; GFX942-NEXT: v_readlane_b32 s36, v22, 3
+; GFX942-NEXT: v_readlane_b32 s35, v22, 2
+; GFX942-NEXT: v_readlane_b32 s34, v22, 1
+; GFX942-NEXT: v_readlane_b32 s33, v22, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: s_add_i32 s2, s32, 0x8040
; GFX942-NEXT: scratch_load_dword v22, off, s2 ; 4-byte Folded Reload
@@ -1408,31 +1410,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v22, s30, 0
+; GFX10_1-NEXT: v_writelane_b32 v22, s33, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5
; GFX10_1-NEXT: s_add_i32 s58, s4, 0x4240
-; GFX10_1-NEXT: v_writelane_b32 v22, s31, 1
+; GFX10_1-NEXT: v_writelane_b32 v22, s34, 1
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_writelane_b32 v22, s33, 2
-; GFX10_1-NEXT: v_writelane_b32 v22, s34, 3
-; GFX10_1-NEXT: v_writelane_b32 v22, s35, 4
-; GFX10_1-NEXT: v_writelane_b32 v22, s36, 5
-; GFX10_1-NEXT: v_writelane_b32 v22, s37, 6
-; GFX10_1-NEXT: v_writelane_b32 v22, s38, 7
-; GFX10_1-NEXT: v_writelane_b32 v22, s39, 8
-; GFX10_1-NEXT: v_writelane_b32 v22, s48, 9
-; GFX10_1-NEXT: v_writelane_b32 v22, s49, 10
-; GFX10_1-NEXT: v_writelane_b32 v22, s50, 11
-; GFX10_1-NEXT: v_writelane_b32 v22, s51, 12
-; GFX10_1-NEXT: v_writelane_b32 v22, s52, 13
-; GFX10_1-NEXT: v_writelane_b32 v22, s53, 14
-; GFX10_1-NEXT: v_writelane_b32 v22, s54, 15
-; GFX10_1-NEXT: v_writelane_b32 v22, s55, 16
+; GFX10_1-NEXT: v_writelane_b32 v22, s35, 2
+; GFX10_1-NEXT: v_writelane_b32 v22, s36, 3
+; GFX10_1-NEXT: v_writelane_b32 v22, s37, 4
+; GFX10_1-NEXT: v_writelane_b32 v22, s38, 5
+; GFX10_1-NEXT: v_writelane_b32 v22, s39, 6
+; GFX10_1-NEXT: v_writelane_b32 v22, s48, 7
+; GFX10_1-NEXT: v_writelane_b32 v22, s49, 8
+; GFX10_1-NEXT: v_writelane_b32 v22, s50, 9
+; GFX10_1-NEXT: v_writelane_b32 v22, s51, 10
+; GFX10_1-NEXT: v_writelane_b32 v22, s52, 11
+; GFX10_1-NEXT: v_writelane_b32 v22, s53, 12
+; GFX10_1-NEXT: v_writelane_b32 v22, s54, 13
+; GFX10_1-NEXT: v_writelane_b32 v22, s55, 14
+; GFX10_1-NEXT: v_writelane_b32 v22, s30, 15
+; GFX10_1-NEXT: v_writelane_b32 v22, s31, 16
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX10_1-NEXT: ;;#ASMEND
@@ -1441,23 +1443,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v22, 16
-; GFX10_1-NEXT: v_readlane_b32 s54, v22, 15
-; GFX10_1-NEXT: v_readlane_b32 s53, v22, 14
-; GFX10_1-NEXT: v_readlane_b32 s52, v22, 13
-; GFX10_1-NEXT: v_readlane_b32 s51, v22, 12
-; GFX10_1-NEXT: v_readlane_b32 s50, v22, 11
-; GFX10_1-NEXT: v_readlane_b32 s49, v22, 10
-; GFX10_1-NEXT: v_readlane_b32 s48, v22, 9
-; GFX10_1-NEXT: v_readlane_b32 s39, v22, 8
-; GFX10_1-NEXT: v_readlane_b32 s38, v22, 7
-; GFX10_1-NEXT: v_readlane_b32 s37, v22, 6
-; GFX10_1-NEXT: v_readlane_b32 s36, v22, 5
-; GFX10_1-NEXT: v_readlane_b32 s35, v22, 4
-; GFX10_1-NEXT: v_readlane_b32 s34, v22, 3
-; GFX10_1-NEXT: v_readlane_b32 s33, v22, 2
-; GFX10_1-NEXT: v_readlane_b32 s31, v22, 1
-; GFX10_1-NEXT: v_readlane_b32 s30, v22, 0
+; GFX10_1-NEXT: v_readlane_b32 s30, v22, 15
+; GFX10_1-NEXT: v_readlane_b32 s31, v22, 16
+; GFX10_1-NEXT: v_readlane_b32 s55, v22, 14
+; GFX10_1-NEXT: v_readlane_b32 s54, v22, 13
+; GFX10_1-NEXT: v_readlane_b32 s53, v22, 12
+; GFX10_1-NEXT: v_readlane_b32 s52, v22, 11
+; GFX10_1-NEXT: v_readlane_b32 s51, v22, 10
+; GFX10_1-NEXT: v_readlane_b32 s50, v22, 9
+; GFX10_1-NEXT: v_readlane_b32 s49, v22, 8
+; GFX10_1-NEXT: v_readlane_b32 s48, v22, 7
+; GFX10_1-NEXT: v_readlane_b32 s39, v22, 6
+; GFX10_1-NEXT: v_readlane_b32 s38, v22, 5
+; GFX10_1-NEXT: v_readlane_b32 s37, v22, 4
+; GFX10_1-NEXT: v_readlane_b32 s36, v22, 3
+; GFX10_1-NEXT: v_readlane_b32 s35, v22, 2
+; GFX10_1-NEXT: v_readlane_b32 s34, v22, 1
+; GFX10_1-NEXT: v_readlane_b32 s33, v22, 0
; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
; GFX10_1-NEXT: buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1473,31 +1475,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
; GFX10_3-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v22, s30, 0
+; GFX10_3-NEXT: v_writelane_b32 v22, s33, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5
; GFX10_3-NEXT: s_add_i32 s58, s4, 0x4240
-; GFX10_3-NEXT: v_writelane_b32 v22, s31, 1
+; GFX10_3-NEXT: v_writelane_b32 v22, s34, 1
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_writelane_b32 v22, s33, 2
-; GFX10_3-NEXT: v_writelane_b32 v22, s34, 3
-; GFX10_3-NEXT: v_writelane_b32 v22, s35, 4
-; GFX10_3-NEXT: v_writelane_b32 v22, s36, 5
-; GFX10_3-NEXT: v_writelane_b32 v22, s37, 6
-; GFX10_3-NEXT: v_writelane_b32 v22, s38, 7
-; GFX10_3-NEXT: v_writelane_b32 v22, s39, 8
-; GFX10_3-NEXT: v_writelane_b32 v22, s48, 9
-; GFX10_3-NEXT: v_writelane_b32 v22, s49, 10
-; GFX10_3-NEXT: v_writelane_b32 v22, s50, 11
-; GFX10_3-NEXT: v_writelane_b32 v22, s51, 12
-; GFX10_3-NEXT: v_writelane_b32 v22, s52, 13
-; GFX10_3-NEXT: v_writelane_b32 v22, s53, 14
-; GFX10_3-NEXT: v_writelane_b32 v22, s54, 15
-; GFX10_3-NEXT: v_writelane_b32 v22, s55, 16
+; GFX10_3-NEXT: v_writelane_b32 v22, s35, 2
+; GFX10_3-NEXT: v_writelane_b32 v22, s36, 3
+; GFX10_3-NEXT: v_writelane_b32 v22, s37, 4
+; GFX10_3-NEXT: v_writelane_b32 v22, s38, 5
+; GFX10_3-NEXT: v_writelane_b32 v22, s39, 6
+; GFX10_3-NEXT: v_writelane_b32 v22, s48, 7
+; GFX10_3-NEXT: v_writelane_b32 v22, s49, 8
+; GFX10_3-NEXT: v_writelane_b32 v22, s50, 9
+; GFX10_3-NEXT: v_writelane_b32 v22, s51, 10
+; GFX10_3-NEXT: v_writelane_b32 v22, s52, 11
+; GFX10_3-NEXT: v_writelane_b32 v22, s53, 12
+; GFX10_3-NEXT: v_writelane_b32 v22, s54, 13
+; GFX10_3-NEXT: v_writelane_b32 v22, s55, 14
+; GFX10_3-NEXT: v_writelane_b32 v22, s30, 15
+; GFX10_3-NEXT: v_writelane_b32 v22, s31, 16
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX10_3-NEXT: ;;#ASMEND
@@ -1506,23 +1508,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v22, 16
-; GFX10_3-NEXT: v_readlane_b32 s54, v22, 15
-; GFX10_3-NEXT: v_readlane_b32 s53, v22, 14
-; GFX10_3-NEXT: v_readlane_b32 s52, v22, 13
-; GFX10_3-NEXT: v_readlane_b32 s51, v22, 12
-; GFX10_3-NEXT: v_readlane_b32 s50, v22, 11
-; GFX10_3-NEXT: v_readlane_b32 s49, v22, 10
-; GFX10_3-NEXT: v_readlane_b32 s48, v22, 9
-; GFX10_3-NEXT: v_readlane_b32 s39, v22, 8
-; GFX10_3-NEXT: v_readlane_b32 s38, v22, 7
-; GFX10_3-NEXT: v_readlane_b32 s37, v22, 6
-; GFX10_3-NEXT: v_readlane_b32 s36, v22, 5
-; GFX10_3-NEXT: v_readlane_b32 s35, v22, 4
-; GFX10_3-NEXT: v_readlane_b32 s34, v22, 3
-; GFX10_3-NEXT: v_readlane_b32 s33, v22, 2
-; GFX10_3-NEXT: v_readlane_b32 s31, v22, 1
-; GFX10_3-NEXT: v_readlane_b32 s30, v22, 0
+; GFX10_3-NEXT: v_readlane_b32 s30, v22, 15
+; GFX10_3-NEXT: v_readlane_b32 s31, v22, 16
+; GFX10_3-NEXT: v_readlane_b32 s55, v22, 14
+; GFX10_3-NEXT: v_readlane_b32 s54, v22, 13
+; GFX10_3-NEXT: v_readlane_b32 s53, v22, 12
+; GFX10_3-NEXT: v_readlane_b32 s52, v22, 11
+; GFX10_3-NEXT: v_readlane_b32 s51, v22, 10
+; GFX10_3-NEXT: v_readlane_b32 s50, v22, 9
+; GFX10_3-NEXT: v_readlane_b32 s49, v22, 8
+; GFX10_3-NEXT: v_readlane_b32 s48, v22, 7
+; GFX10_3-NEXT: v_readlane_b32 s39, v22, 6
+; GFX10_3-NEXT: v_readlane_b32 s38, v22, 5
+; GFX10_3-NEXT: v_readlane_b32 s37, v22, 4
+; GFX10_3-NEXT: v_readlane_b32 s36, v22, 3
+; GFX10_3-NEXT: v_readlane_b32 s35, v22, 2
+; GFX10_3-NEXT: v_readlane_b32 s34, v22, 1
+; GFX10_3-NEXT: v_readlane_b32 s33, v22, 0
; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
; GFX10_3-NEXT: buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload
@@ -1537,30 +1539,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
; GFX11-NEXT: scratch_store_b32 off, v22, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v22, s30, 0
+; GFX11-NEXT: v_writelane_b32 v22, s33, 0
; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: s_add_i32 s58, s32, 0x4240
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_writelane_b32 v22, s31, 1
+; GFX11-NEXT: v_writelane_b32 v22, s34, 1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_writelane_b32 v22, s33, 2
-; GFX11-NEXT: v_writelane_b32 v22, s34, 3
-; GFX11-NEXT: v_writelane_b32 v22, s35, 4
-; GFX11-NEXT: v_writelane_b32 v22, s36, 5
-; GFX11-NEXT: v_writelane_b32 v22, s37, 6
-; GFX11-NEXT: v_writelane_b32 v22, s38, 7
-; GFX11-NEXT: v_writelane_b32 v22, s39, 8
-; GFX11-NEXT: v_writelane_b32 v22, s48, 9
-; GFX11-NEXT: v_writelane_b32 v22, s49, 10
-; GFX11-NEXT: v_writelane_b32 v22, s50, 11
-; GFX11-NEXT: v_writelane_b32 v22, s51, 12
-; GFX11-NEXT: v_writelane_b32 v22, s52, 13
-; GFX11-NEXT: v_writelane_b32 v22, s53, 14
-; GFX11-NEXT: v_writelane_b32 v22, s54, 15
-; GFX11-NEXT: v_writelane_b32 v22, s55, 16
+; GFX11-NEXT: v_writelane_b32 v22, s35, 2
+; GFX11-NEXT: v_writelane_b32 v22, s36, 3
+; GFX11-NEXT: v_writelane_b32 v22, s37, 4
+; GFX11-NEXT: v_writelane_b32 v22, s38, 5
+; GFX11-NEXT: v_writelane_b32 v22, s39, 6
+; GFX11-NEXT: v_writelane_b32 v22, s48, 7
+; GFX11-NEXT: v_writelane_b32 v22, s49, 8
+; GFX11-NEXT: v_writelane_b32 v22, s50, 9
+; GFX11-NEXT: v_writelane_b32 v22, s51, 10
+; GFX11-NEXT: v_writelane_b32 v22, s52, 11
+; GFX11-NEXT: v_writelane_b32 v22, s53, 12
+; GFX11-NEXT: v_writelane_b32 v22, s54, 13
+; GFX11-NEXT: v_writelane_b32 v22, s55, 14
+; GFX11-NEXT: v_writelane_b32 v22, s30, 15
+; GFX11-NEXT: v_writelane_b32 v22, s31, 16
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX11-NEXT: ;;#ASMEND
@@ -1570,23 +1572,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s55, v22, 16
-; GFX11-NEXT: v_readlane_b32 s54, v22, 15
-; GFX11-NEXT: v_readlane_b32 s53, v22, 14
-; GFX11-NEXT: v_readlane_b32 s52, v22, 13
-; GFX11-NEXT: v_readlane_b32 s51, v22, 12
-; GFX11-NEXT: v_readlane_b32 s50, v22, 11
-; GFX11-NEXT: v_readlane_b32 s49, v22, 10
-; GFX11-NEXT: v_readlane_b32 s48, v22, 9
-; GFX11-NEXT: v_readlane_b32 s39, v22, 8
-; GFX11-NEXT: v_readlane_b32 s38, v22, 7
-; GFX11-NEXT: v_readlane_b32 s37, v22, 6
-; GFX11-NEXT: v_readlane_b32 s36, v22, 5
-; GFX11-NEXT: v_readlane_b32 s35, v22, 4
-; GFX11-NEXT: v_readlane_b32 s34, v22, 3
-; GFX11-NEXT: v_readlane_b32 s33, v22, 2
-; GFX11-NEXT: v_readlane_b32 s31, v22, 1
-; GFX11-NEXT: v_readlane_b32 s30, v22, 0
+; GFX11-NEXT: v_readlane_b32 s30, v22, 15
+; GFX11-NEXT: v_readlane_b32 s31, v22, 16
+; GFX11-NEXT: v_readlane_b32 s55, v22, 14
+; GFX11-NEXT: v_readlane_b32 s54, v22, 13
+; GFX11-NEXT: v_readlane_b32 s53, v22, 12
+; GFX11-NEXT: v_readlane_b32 s52, v22, 11
+; GFX11-NEXT: v_readlane_b32 s51, v22, 10
+; GFX11-NEXT: v_readlane_b32 s50, v22, 9
+; GFX11-NEXT: v_readlane_b32 s49, v22, 8
+; GFX11-NEXT: v_readlane_b32 s48, v22, 7
+; GFX11-NEXT: v_readlane_b32 s39, v22, 6
+; GFX11-NEXT: v_readlane_b32 s38, v22, 5
+; GFX11-NEXT: v_readlane_b32 s37, v22, 4
+; GFX11-NEXT: v_readlane_b32 s36, v22, 3
+; GFX11-NEXT: v_readlane_b32 s35, v22, 2
+; GFX11-NEXT: v_readlane_b32 s34, v22, 1
+; GFX11-NEXT: v_readlane_b32 s33, v22, 0
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
; GFX11-NEXT: scratch_load_b32 v22, off, s1 ; 4-byte Folded Reload
@@ -1605,29 +1607,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: scratch_store_b32 off, v22, s32 offset:32768 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v22, s30, 0
+; GFX12-NEXT: v_writelane_b32 v22, s33, 0
; GFX12-NEXT: s_add_co_i32 s58, s32, 0x4200
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_writelane_b32 v22, s31, 1
-; GFX12-NEXT: v_writelane_b32 v22, s33, 2
-; GFX12-NEXT: v_writelane_b32 v22, s34, 3
-; GFX12-NEXT: v_writelane_b32 v22, s35, 4
-; GFX12-NEXT: v_writelane_b32 v22, s36, 5
-; GFX12-NEXT: v_writelane_b32 v22, s37, 6
-; GFX12-NEXT: v_writelane_b32 v22, s38, 7
-; GFX12-NEXT: v_writelane_b32 v22, s39, 8
-; GFX12-NEXT: v_writelane_b32 v22, s48, 9
-; GFX12-NEXT: v_writelane_b32 v22, s49, 10
-; GFX12-NEXT: v_writelane_b32 v22, s50, 11
-; GFX12-NEXT: v_writelane_b32 v22, s51, 12
-; GFX12-NEXT: v_writelane_b32 v22, s52, 13
-; GFX12-NEXT: v_writelane_b32 v22, s53, 14
-; GFX12-NEXT: v_writelane_b32 v22, s54, 15
-; GFX12-NEXT: v_writelane_b32 v22, s55, 16
+; GFX12-NEXT: v_writelane_b32 v22, s34, 1
+; GFX12-NEXT: v_writelane_b32 v22, s35, 2
+; GFX12-NEXT: v_writelane_b32 v22, s36, 3
+; GFX12-NEXT: v_writelane_b32 v22, s37, 4
+; GFX12-NEXT: v_writelane_b32 v22, s38, 5
+; GFX12-NEXT: v_writelane_b32 v22, s39, 6
+; GFX12-NEXT: v_writelane_b32 v22, s48, 7
+; GFX12-NEXT: v_writelane_b32 v22, s49, 8
+; GFX12-NEXT: v_writelane_b32 v22, s50, 9
+; GFX12-NEXT: v_writelane_b32 v22, s51, 10
+; GFX12-NEXT: v_writelane_b32 v22, s52, 11
+; GFX12-NEXT: v_writelane_b32 v22, s53, 12
+; GFX12-NEXT: v_writelane_b32 v22, s54, 13
+; GFX12-NEXT: v_writelane_b32 v22, s55, 14
+; GFX12-NEXT: v_writelane_b32 v22, s30, 15
+; GFX12-NEXT: v_writelane_b32 v22, s31, 16
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX12-NEXT: ;;#ASMEND
@@ -1637,23 +1639,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v22, 16
-; GFX12-NEXT: v_readlane_b32 s54, v22, 15
-; GFX12-NEXT: v_readlane_b32 s53, v22, 14
-; GFX12-NEXT: v_readlane_b32 s52, v22, 13
-; GFX12-NEXT: v_readlane_b32 s51, v22, 12
-; GFX12-NEXT: v_readlane_b32 s50, v22, 11
-; GFX12-NEXT: v_readlane_b32 s49, v22, 10
-; GFX12-NEXT: v_readlane_b32 s48, v22, 9
-; GFX12-NEXT: v_readlane_b32 s39, v22, 8
-; GFX12-NEXT: v_readlane_b32 s38, v22, 7
-; GFX12-NEXT: v_readlane_b32 s37, v22, 6
-; GFX12-NEXT: v_readlane_b32 s36, v22, 5
-; GFX12-NEXT: v_readlane_b32 s35, v22, 4
-; GFX12-NEXT: v_readlane_b32 s34, v22, 3
-; GFX12-NEXT: v_readlane_b32 s33, v22, 2
-; GFX12-NEXT: v_readlane_b32 s31, v22, 1
-; GFX12-NEXT: v_readlane_b32 s30, v22, 0
+; GFX12-NEXT: v_readlane_b32 s30, v22, 15
+; GFX12-NEXT: v_readlane_b32 s31, v22, 16
+; GFX12-NEXT: v_readlane_b32 s55, v22, 14
+; GFX12-NEXT: v_readlane_b32 s54, v22, 13
+; GFX12-NEXT: v_readlane_b32 s53, v22, 12
+; GFX12-NEXT: v_readlane_b32 s52, v22, 11
+; GFX12-NEXT: v_readlane_b32 s51, v22, 10
+; GFX12-NEXT: v_readlane_b32 s50, v22, 9
+; GFX12-NEXT: v_readlane_b32 s49, v22, 8
+; GFX12-NEXT: v_readlane_b32 s48, v22, 7
+; GFX12-NEXT: v_readlane_b32 s39, v22, 6
+; GFX12-NEXT: v_readlane_b32 s38, v22, 5
+; GFX12-NEXT: v_readlane_b32 s37, v22, 4
+; GFX12-NEXT: v_readlane_b32 s36, v22, 3
+; GFX12-NEXT: v_readlane_b32 s35, v22, 2
+; GFX12-NEXT: v_readlane_b32 s34, v22, 1
+; GFX12-NEXT: v_readlane_b32 s33, v22, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v22, off, s32 offset:32768 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 0c0919de4aedc..e95a726ee5df6 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -194,22 +194,22 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: v_writelane_b32 v43, s4, 5
-; GFX9-NEXT: v_writelane_b32 v43, s30, 0
-; GFX9-NEXT: v_writelane_b32 v43, s31, 1
; GFX9-NEXT: s_addk_i32 s32, 0x800
-; GFX9-NEXT: v_writelane_b32 v43, s34, 2
-; GFX9-NEXT: v_writelane_b32 v43, s36, 3
+; GFX9-NEXT: v_writelane_b32 v43, s34, 0
+; GFX9-NEXT: v_writelane_b32 v43, s36, 1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v43, s37, 4
+; GFX9-NEXT: v_writelane_b32 v43, s37, 2
; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, v1
; GFX9-NEXT: v_mov_b32_e32 v41, v0
+; GFX9-NEXT: v_writelane_b32 v43, s30, 3
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
+; GFX9-NEXT: v_writelane_b32 v43, s31, 4
; GFX9-NEXT: s_mov_b32 s34, s15
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -224,11 +224,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s37, v43, 4
-; GFX9-NEXT: v_readlane_b32 s36, v43, 3
-; GFX9-NEXT: v_readlane_b32 s34, v43, 2
-; GFX9-NEXT: v_readlane_b32 s31, v43, 1
-; GFX9-NEXT: v_readlane_b32 s30, v43, 0
+; GFX9-NEXT: v_readlane_b32 s30, v43, 3
+; GFX9-NEXT: v_readlane_b32 s31, v43, 4
+; GFX9-NEXT: v_readlane_b32 s37, v43, 2
+; GFX9-NEXT: v_readlane_b32 s36, v43, 1
+; GFX9-NEXT: v_readlane_b32 s34, v43, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v43, 5
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 65446a036c91b..878302e4865bb 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -47,8 +47,8 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
+; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
@@ -190,8 +190,8 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
+; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -224,8 +224,8 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v2, 1
; CHECK-NEXT: v_readlane_b32 s30, v2, 0
+; CHECK-NEXT: v_readlane_b32 s31, v2, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index ccaf0ac5377e4..8394b325bee6d 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -29,8 +29,8 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -68,8 +68,8 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 6b6c60ebe2a9e..133cc166c3311 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -247,8 +247,8 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: .Ltmp1:
; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
index 28bab355c359d..478a3194709b3 100644
--- a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
@@ -35,9 +35,9 @@ define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_mov_b32_e32 v0, v2
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_readlane_b32 s30, v4, 0
; CHECK-NEXT: v_min_f32_e32 v0, v3, v0
; CHECK-NEXT: v_readlane_b32 s31, v4, 1
-; CHECK-NEXT: v_readlane_b32 s30, v4, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -85,10 +85,10 @@ define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: v_mov_b32_e32 v0, v3
; CHECK-NEXT: v_mov_b32_e32 v1, v2
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_readlane_b32 s30, v6, 0
; CHECK-NEXT: v_min_f32_e32 v0, v4, v0
; CHECK-NEXT: v_min_f32_e32 v1, v5, v1
; CHECK-NEXT: v_readlane_b32 s31, v6, 1
-; CHECK-NEXT: v_readlane_b32 s30, v6, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -136,9 +136,9 @@ define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) {
; CHECK-NEXT: v_mov_b32_e32 v0, v5
; CHECK-NEXT: v_mov_b32_e32 v1, v4
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_readlane_b32 s30, v6, 0
; CHECK-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
; CHECK-NEXT: v_readlane_b32 s31, v6, 1
-; CHECK-NEXT: v_readlane_b32 s30, v6, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -209,8 +209,8 @@ define nofpclass(nan inf) { double, double } @aggregate() {
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -247,10 +247,10 @@ define { float, float } @aggregate_use(float %z) {
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_max_f32_e32 v2, v40, v40
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s30, v41, 0
; CHECK-NEXT: v_min_f32_e32 v0, v0, v2
; CHECK-NEXT: v_min_f32_e32 v1, v1, v2
; CHECK-NEXT: v_readlane_b32 s31, v41, 1
-; CHECK-NEXT: v_readlane_b32 s30, v41, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v41, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -317,13 +317,13 @@ define <5 x double> @call_nofpclass_funcs_v5f64_non_mvt_vector(ptr addrspace(1)
; CHECK-NEXT: v_mov_b32_e32 v20, v8
; CHECK-NEXT: v_mov_b32_e32 v21, v9
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_readlane_b32 s30, v24, 0
; CHECK-NEXT: v_min_f64 v[0:1], v[12:13], v[0:1]
; CHECK-NEXT: v_min_f64 v[2:3], v[14:15], v[2:3]
; CHECK-NEXT: v_min_f64 v[4:5], v[16:17], v[4:5]
; CHECK-NEXT: v_min_f64 v[6:7], v[18:19], v[6:7]
; CHECK-NEXT: v_min_f64 v[8:9], v[20:21], v[8:9]
; CHECK-NEXT: v_readlane_b32 s31, v24, 1
-; CHECK-NEXT: v_readlane_b32 s30, v24, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v24, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 1521ad5219174..6fefed6e07f2d 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -338,8 +338,8 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT: v_readlane_b32 s31, v41, 1
; GFX906-NEXT: v_readlane_b32 s30, v41, 0
+; GFX906-NEXT: v_readlane_b32 s31, v41, 1
; GFX906-NEXT: s_mov_b32 s32, s33
; GFX906-NEXT: v_readlane_b32 s4, v41, 4
; GFX906-NEXT: v_readlane_b32 s34, v41, 2
@@ -398,21 +398,14 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_addk_i32 s32, 0x2c00
; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX908-NEXT: s_mov_b64 s[16:17], exec
-; GFX908-NEXT: s_mov_b64 exec, 1
+; GFX908-NEXT: s_mov_b64 exec, 3
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:168
; GFX908-NEXT: v_writelane_b32 v2, s30, 0
+; GFX908-NEXT: v_writelane_b32 v2, s31, 1
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:168
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[16:17]
-; GFX908-NEXT: s_mov_b64 s[16:17], exec
-; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:168
-; GFX908-NEXT: v_writelane_b32 v2, s31, 0
-; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:168
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_mov_b64 exec, s[16:17]
; GFX908-NEXT: s_mov_b32 s21, s15
; GFX908-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
; GFX908-NEXT: s_mov_b32 s22, s14
@@ -755,20 +748,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168
-; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_readlane_b32 s31, v0, 0
-; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_mov_b64 exec, s[4:5]
-; GFX908-NEXT: s_mov_b64 s[4:5], exec
-; GFX908-NEXT: s_mov_b64 exec, 1
+; GFX908-NEXT: s_mov_b64 exec, 3
; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readlane_b32 s30, v0, 0
+; GFX908-NEXT: v_readlane_b32 s31, v0, 1
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 9d8a54b4cc178..0f0274ccba346 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -20,9 +20,9 @@ define void @test_remat_s_getpc_b64() {
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
-; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -45,8 +45,8 @@ define void @test_remat_s_getpc_b64() {
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
@@ -79,8 +79,8 @@ define void @test_remat_s_getpc_b64() {
; GFX12-NEXT: s_sext_i32_i16 s1, s1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_readlane_b32 s31, v2, 1
; GFX12-NEXT: v_readlane_b32 s30, v2, 0
+; GFX12-NEXT: v_readlane_b32 s31, v2, 1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 03b88858c7318..74719d5037db9 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -28,16 +28,16 @@ body: |
; GCN-LABEL: name: test_main
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
- ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0
+ ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33
; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32
; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.73, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.68, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.69, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.70, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.71, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.72, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr2
@@ -66,48 +66,48 @@ body: |
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr2
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr2
; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr2
- ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr2
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr3
- ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr3
- ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr4
- ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr4
- ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr4
- ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr4
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 26, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 27, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 28, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 29, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 30, $vgpr2
+ ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 31, $vgpr2
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 0, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 1, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 2, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 3, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 4, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 5, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 6, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 7, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 8, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 9, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 10, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 11, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 12, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 13, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 14, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 15, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 16, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 17, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 18, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 19, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 20, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 21, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 22, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 23, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 24, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 25, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 26, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 27, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 28, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 29, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 30, $vgpr3
+ ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 31, $vgpr3
+ ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 0, $vgpr4
+ ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 1, $vgpr4
+ ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr30, 2, $vgpr4, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr31, 3, $vgpr4, implicit $sgpr30_sgpr31
; GCN-NEXT: $sgpr22 = IMPLICIT_DEF
; GCN-NEXT: $vgpr5 = IMPLICIT_DEF
; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5
@@ -130,48 +130,48 @@ body: |
; GCN-NEXT: bb.3:
; GCN-NEXT: liveins: $vcc_hi
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
- ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2
- ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
- ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
- ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
- ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
- ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
- ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
- ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
- ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
- ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
- ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
- ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
- ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
- ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
- ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
- ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
- ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
- ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
- ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
- ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
- ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
- ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
- ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
- ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
- ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
- ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
- ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
- ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
- ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
- ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
- ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
- ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
- ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
- ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
- ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
- ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
- ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
- ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
- ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
- ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
- ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
+ ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2, implicit-def $sgpr30_sgpr31
+ ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
+ ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
+ ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
+ ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
+ ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
+ ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
+ ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
+ ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
+ ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
+ ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
+ ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
+ ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
+ ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
+ ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
+ ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
+ ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
+ ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
+ ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
+ ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
+ ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
+ ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
+ ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
+ ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
+ ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
+ ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
+ ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
+ ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
+ ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
+ ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
+ ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
+ ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
+ ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
+ ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
+ ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
+ ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
+ ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
+ ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
+ ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
+ ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
+ ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
+ ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25
; GCN-NEXT: $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24
; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23
@@ -200,11 +200,11 @@ body: |
; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
; GCN-NEXT: $sgpr32 = frame-destroy COPY $sgpr33
; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.69, addrspace 5)
- ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.70, addrspace 5)
- ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.71, addrspace 5)
- ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.72, addrspace 5)
- ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.73, addrspace 5)
+ ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.68, addrspace 5)
+ ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.69, addrspace 5)
+ ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.70, addrspace 5)
+ ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.71, addrspace 5)
+ ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" load (s32) from %stack.72, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi
; GCN-NEXT: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 702953c56a5cb..cb54b0ba629c3 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -152,8 +152,8 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v255, 1
; GCN-NEXT: v_readlane_b32 s30, v255, 0
+; GCN-NEXT: v_readlane_b32 s31, v255, 1
; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -445,8 +445,8 @@ define void @spill_to_lowest_available_vgpr() #0 {
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v254, 1
; GCN-NEXT: v_readlane_b32 s30, v254, 0
+; GCN-NEXT: v_readlane_b32 s31, v254, 1
; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -1632,21 +1632,14 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 s[16:17], exec
-; GCN-NEXT: s_mov_b64 exec, 1
+; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456
; GCN-NEXT: v_writelane_b32 v0, s30, 0
+; GCN-NEXT: v_writelane_b32 v0, s31, 1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: s_mov_b64 s[16:17], exec
-; GCN-NEXT: s_mov_b64 exec, 1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT: v_writelane_b32 v0, s31, 0
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, child_function_ipra@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, child_function_ipra@rel32@hi+12
@@ -1656,20 +1649,12 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_mov_b64 s[4:5], exec
-; GCN-NEXT: s_mov_b64 exec, 1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v0, 0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_mov_b64 s[4:5], exec
-; GCN-NEXT: s_mov_b64 exec, 1
+; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s30, v0, 0
+; GCN-NEXT: v_readlane_b32 s31, v0, 1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index 1c2215d39dc02..feaca47f98e36 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -14610,13 +14610,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s8, s30
; GFX900-NEXT: s_mov_b32 s9, s31
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s10, s12
; GFX900-NEXT: s_mov_b32 s11, s13
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -14639,13 +14639,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s8, s30
; GFX90A-NEXT: s_mov_b32 s9, s31
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s10, s12
; GFX90A-NEXT: s_mov_b32 s11, s13
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -14750,13 +14750,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s8, s30
; GFX900-NEXT: s_mov_b32 s9, s31
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s10, s12
; GFX900-NEXT: s_mov_b32 s11, s13
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -14779,13 +14779,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s8, s30
; GFX90A-NEXT: s_mov_b32 s9, s31
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s10, s12
; GFX90A-NEXT: s_mov_b32 s11, s13
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -14802,19 +14802,19 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s30
; GFX942-NEXT: s_mov_b32 s9, s31
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -14845,12 +14845,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
; GFX900-NEXT: s_mov_b32 s12, s30
; GFX900-NEXT: s_mov_b32 s13, s31
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -14874,12 +14874,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
; GFX90A-NEXT: s_mov_b32 s12, s30
; GFX90A-NEXT: s_mov_b32 s13, s31
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -14999,22 +14999,22 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s30
; GFX942-NEXT: s_mov_b32 s9, s31
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -15120,6 +15120,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -15127,12 +15128,12 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
; GFX942-NEXT: s_mov_b32 s12, s30
; GFX942-NEXT: s_mov_b32 s13, s31
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -16167,20 +16168,21 @@ define void @s_shuffle_v2i64_v8i64__12_0() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s16
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s17
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -16890,20 +16892,21 @@ define void @s_shuffle_v2i64_v8i64__12_1() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s10, s18
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s19
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -17481,6 +17484,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s18
; GFX900-NEXT: s_mov_b32 s9, s19
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -17489,7 +17493,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -17510,6 +17513,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s18
; GFX90A-NEXT: s_mov_b32 s9, s19
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -17518,7 +17522,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -17565,13 +17568,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s10, s20
; GFX900-NEXT: s_mov_b32 s11, s21
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -17592,13 +17595,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s10, s20
; GFX90A-NEXT: s_mov_b32 s11, s21
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -17612,6 +17615,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -17620,13 +17624,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s10, s20
; GFX942-NEXT: s_mov_b32 s11, s21
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -17654,6 +17658,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s22
; GFX900-NEXT: s_mov_b32 s9, s23
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -17662,7 +17667,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -17683,6 +17687,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s22
; GFX90A-NEXT: s_mov_b32 s9, s23
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -17691,7 +17696,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -17798,6 +17802,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s26
; GFX900-NEXT: s_mov_b32 s9, s27
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -17806,7 +17811,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -17827,6 +17831,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s26
; GFX90A-NEXT: s_mov_b32 s9, s27
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -17835,7 +17840,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -18315,13 +18319,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s10, s22
; GFX900-NEXT: s_mov_b32 s11, s23
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -18342,13 +18346,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s10, s22
; GFX90A-NEXT: s_mov_b32 s11, s23
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -18362,6 +18366,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -18370,13 +18375,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s10, s22
; GFX942-NEXT: s_mov_b32 s11, s23
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -18950,6 +18955,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s18
; GFX900-NEXT: s_mov_b32 s9, s19
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -18958,7 +18964,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -18979,6 +18984,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s18
; GFX90A-NEXT: s_mov_b32 s9, s19
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -18987,7 +18993,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19004,19 +19009,19 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -19100,6 +19105,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s22
; GFX900-NEXT: s_mov_b32 s9, s23
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -19108,7 +19114,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19129,6 +19134,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s22
; GFX90A-NEXT: s_mov_b32 s9, s23
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -19137,7 +19143,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19154,19 +19159,19 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s22
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s23
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -19197,12 +19202,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
; GFX900-NEXT: s_mov_b32 s26, s12
; GFX900-NEXT: s_mov_b32 s27, s13
; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19226,12 +19231,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
; GFX90A-NEXT: s_mov_b32 s26, s12
; GFX90A-NEXT: s_mov_b32 s27, s13
; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19276,6 +19281,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s26
; GFX900-NEXT: s_mov_b32 s9, s27
; GFX900-NEXT: s_mov_b32 s10, s12
@@ -19284,7 +19290,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19305,6 +19310,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s26
; GFX90A-NEXT: s_mov_b32 s9, s27
; GFX90A-NEXT: s_mov_b32 s10, s12
@@ -19313,7 +19319,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19330,19 +19335,19 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s26
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s27
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -19374,11 +19379,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
; GFX900-NEXT: s_mov_b32 s31, s13
; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19403,11 +19408,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
; GFX90A-NEXT: s_mov_b32 s31, s13
; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -19874,12 +19879,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
; GFX900-NEXT: s_mov_b32 s12, s18
; GFX900-NEXT: s_mov_b32 s13, s19
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -19903,12 +19908,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
; GFX90A-NEXT: s_mov_b32 s12, s18
; GFX90A-NEXT: s_mov_b32 s13, s19
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20012,12 +20017,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
; GFX900-NEXT: s_mov_b32 s12, s22
; GFX900-NEXT: s_mov_b32 s13, s23
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -20041,12 +20046,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
; GFX90A-NEXT: s_mov_b32 s12, s22
; GFX90A-NEXT: s_mov_b32 s13, s23
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20094,12 +20099,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
; GFX900-NEXT: s_mov_b32 s26, s14
; GFX900-NEXT: s_mov_b32 s27, s15
; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -20123,12 +20128,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
; GFX90A-NEXT: s_mov_b32 s26, s14
; GFX90A-NEXT: s_mov_b32 s27, s15
; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20176,12 +20181,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
; GFX900-NEXT: s_mov_b32 s12, s26
; GFX900-NEXT: s_mov_b32 s13, s27
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -20205,12 +20210,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
; GFX90A-NEXT: s_mov_b32 s12, s26
; GFX90A-NEXT: s_mov_b32 s13, s27
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20259,11 +20264,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
; GFX900-NEXT: s_mov_b32 s31, s15
; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -20288,11 +20293,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
; GFX90A-NEXT: s_mov_b32 s31, s15
; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -20846,22 +20851,22 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -21020,22 +21025,22 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s22
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s23
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -21244,22 +21249,22 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s10, s12
+; GFX942-NEXT: s_mov_b32 s11, s13
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s26
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s27
-; GFX942-NEXT: s_mov_b32 s10, s12
-; GFX942-NEXT: s_mov_b32 s11, s13
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -21362,10 +21367,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -21373,11 +21379,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
; GFX942-NEXT: s_mov_b32 s31, s13
; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -21909,6 +21915,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -21916,12 +21923,12 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
; GFX942-NEXT: s_mov_b32 s12, s18
; GFX942-NEXT: s_mov_b32 s13, s19
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -22083,6 +22090,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -22090,12 +22098,12 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
; GFX942-NEXT: s_mov_b32 s12, s22
; GFX942-NEXT: s_mov_b32 s13, s23
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -22307,6 +22315,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -22314,12 +22323,12 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
; GFX942-NEXT: s_mov_b32 s12, s26
; GFX942-NEXT: s_mov_b32 s13, s27
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -22422,10 +22431,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
@@ -22433,11 +22443,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
; GFX942-NEXT: s_mov_b32 s31, s15
; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -23434,12 +23444,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
; GFX900-NEXT: s_mov_b32 s14, s18
; GFX900-NEXT: s_mov_b32 s15, s19
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -23463,12 +23473,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
; GFX90A-NEXT: s_mov_b32 s14, s18
; GFX90A-NEXT: s_mov_b32 s15, s19
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -23513,13 +23523,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:23]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s26
; GFX900-NEXT: s_mov_b32 s9, s27
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -23540,13 +23550,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:23]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s26
; GFX90A-NEXT: s_mov_b32 s9, s27
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -23560,6 +23570,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -23568,13 +23579,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[8:23]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s8, s26
; GFX942-NEXT: s_mov_b32 s9, s27
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -23680,6 +23691,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -23687,12 +23699,12 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
; GFX942-NEXT: s_mov_b32 s14, s18
; GFX942-NEXT: s_mov_b32 s15, s19
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -24284,12 +24296,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
; GFX900-NEXT: s_mov_b32 s14, s20
; GFX900-NEXT: s_mov_b32 s15, s21
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -24313,12 +24325,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
; GFX90A-NEXT: s_mov_b32 s14, s20
; GFX90A-NEXT: s_mov_b32 s15, s21
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -24363,6 +24375,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s14
; GFX900-NEXT: s_mov_b32 s9, s15
; GFX900-NEXT: s_mov_b32 s10, s20
@@ -24371,7 +24384,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -24392,6 +24404,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s14
; GFX90A-NEXT: s_mov_b32 s9, s15
; GFX90A-NEXT: s_mov_b32 s10, s20
@@ -24400,7 +24413,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -24524,6 +24536,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -24531,12 +24544,12 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
; GFX942-NEXT: s_mov_b32 s14, s20
; GFX942-NEXT: s_mov_b32 s15, s21
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -24639,22 +24652,22 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: s_mov_b32 s9, s15
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s14
-; GFX942-NEXT: s_mov_b32 s9, s15
; GFX942-NEXT: s_mov_b32 s10, s20
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s21
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -25235,13 +25248,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s22
; GFX900-NEXT: s_mov_b32 s9, s23
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -25262,13 +25275,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s22
; GFX90A-NEXT: s_mov_b32 s9, s23
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -25282,6 +25295,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -25290,13 +25304,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[4:19]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s8, s22
; GFX942-NEXT: s_mov_b32 s9, s23
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -25327,12 +25341,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
; GFX900-NEXT: s_mov_b32 s14, s22
; GFX900-NEXT: s_mov_b32 s15, s23
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -25356,12 +25370,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
; GFX90A-NEXT: s_mov_b32 s14, s22
; GFX90A-NEXT: s_mov_b32 s15, s23
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -25540,6 +25554,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -25547,12 +25562,12 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
; GFX942-NEXT: s_mov_b32 s14, s22
; GFX942-NEXT: s_mov_b32 s15, s23
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -26144,12 +26159,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
; GFX900-NEXT: s_mov_b32 s14, s24
; GFX900-NEXT: s_mov_b32 s15, s25
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -26173,12 +26188,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
; GFX90A-NEXT: s_mov_b32 s14, s24
; GFX90A-NEXT: s_mov_b32 s15, s25
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -26223,6 +26238,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s14
; GFX900-NEXT: s_mov_b32 s9, s15
; GFX900-NEXT: s_mov_b32 s10, s24
@@ -26231,7 +26247,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -26252,6 +26267,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s14
; GFX90A-NEXT: s_mov_b32 s9, s15
; GFX90A-NEXT: s_mov_b32 s10, s24
@@ -26260,7 +26276,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -26384,6 +26399,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -26391,12 +26407,12 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
; GFX942-NEXT: s_mov_b32 s14, s24
; GFX942-NEXT: s_mov_b32 s15, s25
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -26499,22 +26515,22 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: s_mov_b32 s9, s15
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s14
-; GFX942-NEXT: s_mov_b32 s9, s15
; GFX942-NEXT: s_mov_b32 s10, s24
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s25
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -26880,20 +26896,21 @@ define void @s_shuffle_v2i64_v8i64__1_13() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s18
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s9, s19
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -27040,12 +27057,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
; GFX900-NEXT: s_mov_b32 s14, s26
; GFX900-NEXT: s_mov_b32 s15, s27
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -27069,12 +27086,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
; GFX90A-NEXT: s_mov_b32 s14, s26
; GFX90A-NEXT: s_mov_b32 s15, s27
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -27122,12 +27139,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
; GFX900-NEXT: s_mov_b32 s24, s14
; GFX900-NEXT: s_mov_b32 s25, s15
; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -27151,12 +27168,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
; GFX90A-NEXT: s_mov_b32 s24, s14
; GFX90A-NEXT: s_mov_b32 s25, s15
; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -27279,6 +27296,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -27286,12 +27304,12 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
; GFX942-NEXT: s_mov_b32 s14, s26
; GFX942-NEXT: s_mov_b32 s15, s27
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -27997,12 +28015,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
; GFX900-NEXT: s_mov_b32 s14, s28
; GFX900-NEXT: s_mov_b32 s15, s29
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -28026,12 +28044,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
; GFX90A-NEXT: s_mov_b32 s14, s28
; GFX90A-NEXT: s_mov_b32 s15, s29
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -28076,6 +28094,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[16:31]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b32 s8, s14
; GFX900-NEXT: s_mov_b32 s9, s15
; GFX900-NEXT: s_mov_b32 s10, s28
@@ -28084,7 +28103,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -28105,6 +28123,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[16:31]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b32 s8, s14
; GFX90A-NEXT: s_mov_b32 s9, s15
; GFX90A-NEXT: s_mov_b32 s10, s28
@@ -28113,7 +28132,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -28237,6 +28255,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -28244,12 +28263,12 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
; GFX942-NEXT: s_mov_b32 s14, s28
; GFX942-NEXT: s_mov_b32 s15, s29
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -28352,22 +28371,22 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s8, s14
+; GFX942-NEXT: s_mov_b32 s9, s15
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s14
-; GFX942-NEXT: s_mov_b32 s9, s15
; GFX942-NEXT: s_mov_b32 s10, s28
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b32 s11, s29
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -28978,12 +28997,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
; GFX900-NEXT: s_mov_b32 s14, s30
; GFX900-NEXT: s_mov_b32 s15, s31
; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -29007,12 +29026,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
; GFX90A-NEXT: s_mov_b32 s14, s30
; GFX90A-NEXT: s_mov_b32 s15, s31
; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -29061,11 +29080,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
; GFX900-NEXT: s_mov_b32 s29, s15
; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s31, v0, 1
-; GFX900-NEXT: v_readlane_b32 s30, v0, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -29090,11 +29109,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
; GFX90A-NEXT: s_mov_b32 s29, s15
; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v0, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v0, 0
; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, s[4:5]
@@ -29219,6 +29238,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[16:31]
@@ -29226,12 +29246,12 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
; GFX942-NEXT: s_mov_b32 s14, s30
; GFX942-NEXT: s_mov_b32 s15, s31
; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
@@ -29334,22 +29354,23 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: v_writelane_b32 v0, s30, 0
-; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[16:31]
+; GFX942-NEXT: ; def s[0:15]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_writelane_b32 v0, s31, 1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:15]
+; GFX942-NEXT: ; def s[16:31]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s28, s14
; GFX942-NEXT: s_mov_b32 s29, s15
; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29]
; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s31, v0, 1
-; GFX942-NEXT: v_readlane_b32 s30, v0, 0
; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
index 1ffef8e60d90d..ea67593d72761 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir
@@ -24,10 +24,10 @@ machineFunctionInfo:
body: |
bb.0:
; SGPR_SPILLED-LABEL: name: stack-slot-share-equal-sized-spills
- ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+ ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: {{ $}}
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]], implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
@@ -89,10 +89,10 @@ machineFunctionInfo:
body: |
bb.0:
; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-large-spill-first
- ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+ ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: {{ $}}
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3
@@ -152,10 +152,10 @@ machineFunctionInfo:
body: |
bb.0:
; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-small-spill-first
- ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62
+ ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: {{ $}}
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62
- ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
+ ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]]
; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]]
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index ec940d9d0955f..d9d2a99c3e02d 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -242,8 +242,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -425,8 +425,8 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -469,11 +469,11 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3
; GCN-NEXT: v_mov_b32_e32 v1, v40
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: v_readlane_b32 s30, v42, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
; GCN-NEXT: v_readlane_b32 s31, v42, 1
-; GCN-NEXT: v_readlane_b32 s30, v42, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s6, v42, 2
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -603,23 +603,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; FIJI-NEXT: s_mov_b64 exec, s[18:19]
; FIJI-NEXT: v_writelane_b32 v40, s16, 18
-; FIJI-NEXT: v_writelane_b32 v40, s30, 0
-; FIJI-NEXT: v_writelane_b32 v40, s31, 1
-; FIJI-NEXT: v_writelane_b32 v40, s34, 2
-; FIJI-NEXT: v_writelane_b32 v40, s35, 3
-; FIJI-NEXT: v_writelane_b32 v40, s36, 4
-; FIJI-NEXT: v_writelane_b32 v40, s37, 5
-; FIJI-NEXT: v_writelane_b32 v40, s38, 6
-; FIJI-NEXT: v_writelane_b32 v40, s39, 7
-; FIJI-NEXT: v_writelane_b32 v40, s48, 8
-; FIJI-NEXT: v_writelane_b32 v40, s49, 9
-; FIJI-NEXT: v_writelane_b32 v40, s50, 10
-; FIJI-NEXT: v_writelane_b32 v40, s51, 11
-; FIJI-NEXT: v_writelane_b32 v40, s52, 12
-; FIJI-NEXT: v_writelane_b32 v40, s53, 13
-; FIJI-NEXT: v_writelane_b32 v40, s54, 14
-; FIJI-NEXT: v_writelane_b32 v40, s55, 15
-; FIJI-NEXT: v_writelane_b32 v40, s64, 16
+; FIJI-NEXT: v_writelane_b32 v40, s34, 0
+; FIJI-NEXT: v_writelane_b32 v40, s35, 1
+; FIJI-NEXT: v_writelane_b32 v40, s36, 2
+; FIJI-NEXT: v_writelane_b32 v40, s37, 3
+; FIJI-NEXT: v_writelane_b32 v40, s38, 4
+; FIJI-NEXT: v_writelane_b32 v40, s39, 5
+; FIJI-NEXT: v_writelane_b32 v40, s48, 6
+; FIJI-NEXT: v_writelane_b32 v40, s49, 7
+; FIJI-NEXT: v_writelane_b32 v40, s50, 8
+; FIJI-NEXT: v_writelane_b32 v40, s51, 9
+; FIJI-NEXT: v_writelane_b32 v40, s52, 10
+; FIJI-NEXT: v_writelane_b32 v40, s53, 11
+; FIJI-NEXT: v_writelane_b32 v40, s54, 12
+; FIJI-NEXT: v_writelane_b32 v40, s55, 13
+; FIJI-NEXT: v_writelane_b32 v40, s64, 14
+; FIJI-NEXT: v_writelane_b32 v40, s65, 15
+; FIJI-NEXT: v_writelane_b32 v40, s30, 16
; FIJI-NEXT: s_mov_b32 s50, s15
; FIJI-NEXT: s_mov_b32 s51, s14
; FIJI-NEXT: s_mov_b32 s52, s13
@@ -631,7 +631,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; FIJI-NEXT: s_mov_b64 s[54:55], exec
; FIJI-NEXT: s_addk_i32 s32, 0x400
-; FIJI-NEXT: v_writelane_b32 v40, s65, 17
+; FIJI-NEXT: v_writelane_b32 v40, s31, 17
; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; FIJI-NEXT: v_readfirstlane_b32 s16, v0
; FIJI-NEXT: v_readfirstlane_b32 s17, v1
@@ -657,25 +657,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; FIJI-NEXT: s_cbranch_execnz .LBB18_1
; FIJI-NEXT: ; %bb.2:
; FIJI-NEXT: s_mov_b64 exec, s[54:55]
+; FIJI-NEXT: v_readlane_b32 s30, v40, 16
; FIJI-NEXT: v_mov_b32_e32 v0, v4
-; FIJI-NEXT: v_readlane_b32 s65, v40, 17
-; FIJI-NEXT: v_readlane_b32 s64, v40, 16
-; FIJI-NEXT: v_readlane_b32 s55, v40, 15
-; FIJI-NEXT: v_readlane_b32 s54, v40, 14
-; FIJI-NEXT: v_readlane_b32 s53, v40, 13
-; FIJI-NEXT: v_readlane_b32 s52, v40, 12
-; FIJI-NEXT: v_readlane_b32 s51, v40, 11
-; FIJI-NEXT: v_readlane_b32 s50, v40, 10
-; FIJI-NEXT: v_readlane_b32 s49, v40, 9
-; FIJI-NEXT: v_readlane_b32 s48, v40, 8
-; FIJI-NEXT: v_readlane_b32 s39, v40, 7
-; FIJI-NEXT: v_readlane_b32 s38, v40, 6
-; FIJI-NEXT: v_readlane_b32 s37, v40, 5
-; FIJI-NEXT: v_readlane_b32 s36, v40, 4
-; FIJI-NEXT: v_readlane_b32 s35, v40, 3
-; FIJI-NEXT: v_readlane_b32 s34, v40, 2
-; FIJI-NEXT: v_readlane_b32 s31, v40, 1
-; FIJI-NEXT: v_readlane_b32 s30, v40, 0
+; FIJI-NEXT: v_readlane_b32 s31, v40, 17
+; FIJI-NEXT: v_readlane_b32 s65, v40, 15
+; FIJI-NEXT: v_readlane_b32 s64, v40, 14
+; FIJI-NEXT: v_readlane_b32 s55, v40, 13
+; FIJI-NEXT: v_readlane_b32 s54, v40, 12
+; FIJI-NEXT: v_readlane_b32 s53, v40, 11
+; FIJI-NEXT: v_readlane_b32 s52, v40, 10
+; FIJI-NEXT: v_readlane_b32 s51, v40, 9
+; FIJI-NEXT: v_readlane_b32 s50, v40, 8
+; FIJI-NEXT: v_readlane_b32 s49, v40, 7
+; FIJI-NEXT: v_readlane_b32 s48, v40, 6
+; FIJI-NEXT: v_readlane_b32 s39, v40, 5
+; FIJI-NEXT: v_readlane_b32 s38, v40, 4
+; FIJI-NEXT: v_readlane_b32 s37, v40, 3
+; FIJI-NEXT: v_readlane_b32 s36, v40, 2
+; FIJI-NEXT: v_readlane_b32 s35, v40, 1
+; FIJI-NEXT: v_readlane_b32 s34, v40, 0
; FIJI-NEXT: s_mov_b32 s32, s33
; FIJI-NEXT: v_readlane_b32 s4, v40, 18
; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -694,23 +694,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HAWAII-NEXT: s_mov_b64 exec, s[18:19]
; HAWAII-NEXT: v_writelane_b32 v40, s16, 18
-; HAWAII-NEXT: v_writelane_b32 v40, s30, 0
-; HAWAII-NEXT: v_writelane_b32 v40, s31, 1
-; HAWAII-NEXT: v_writelane_b32 v40, s34, 2
-; HAWAII-NEXT: v_writelane_b32 v40, s35, 3
-; HAWAII-NEXT: v_writelane_b32 v40, s36, 4
-; HAWAII-NEXT: v_writelane_b32 v40, s37, 5
-; HAWAII-NEXT: v_writelane_b32 v40, s38, 6
-; HAWAII-NEXT: v_writelane_b32 v40, s39, 7
-; HAWAII-NEXT: v_writelane_b32 v40, s48, 8
-; HAWAII-NEXT: v_writelane_b32 v40, s49, 9
-; HAWAII-NEXT: v_writelane_b32 v40, s50, 10
-; HAWAII-NEXT: v_writelane_b32 v40, s51, 11
-; HAWAII-NEXT: v_writelane_b32 v40, s52, 12
-; HAWAII-NEXT: v_writelane_b32 v40, s53, 13
-; HAWAII-NEXT: v_writelane_b32 v40, s54, 14
-; HAWAII-NEXT: v_writelane_b32 v40, s55, 15
-; HAWAII-NEXT: v_writelane_b32 v40, s64, 16
+; HAWAII-NEXT: v_writelane_b32 v40, s34, 0
+; HAWAII-NEXT: v_writelane_b32 v40, s35, 1
+; HAWAII-NEXT: v_writelane_b32 v40, s36, 2
+; HAWAII-NEXT: v_writelane_b32 v40, s37, 3
+; HAWAII-NEXT: v_writelane_b32 v40, s38, 4
+; HAWAII-NEXT: v_writelane_b32 v40, s39, 5
+; HAWAII-NEXT: v_writelane_b32 v40, s48, 6
+; HAWAII-NEXT: v_writelane_b32 v40, s49, 7
+; HAWAII-NEXT: v_writelane_b32 v40, s50, 8
+; HAWAII-NEXT: v_writelane_b32 v40, s51, 9
+; HAWAII-NEXT: v_writelane_b32 v40, s52, 10
+; HAWAII-NEXT: v_writelane_b32 v40, s53, 11
+; HAWAII-NEXT: v_writelane_b32 v40, s54, 12
+; HAWAII-NEXT: v_writelane_b32 v40, s55, 13
+; HAWAII-NEXT: v_writelane_b32 v40, s64, 14
+; HAWAII-NEXT: v_writelane_b32 v40, s65, 15
+; HAWAII-NEXT: v_writelane_b32 v40, s30, 16
; HAWAII-NEXT: s_mov_b32 s50, s15
; HAWAII-NEXT: s_mov_b32 s51, s14
; HAWAII-NEXT: s_mov_b32 s52, s13
@@ -722,7 +722,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; HAWAII-NEXT: s_mov_b64 s[54:55], exec
; HAWAII-NEXT: s_addk_i32 s32, 0x400
-; HAWAII-NEXT: v_writelane_b32 v40, s65, 17
+; HAWAII-NEXT: v_writelane_b32 v40, s31, 17
; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; HAWAII-NEXT: v_readfirstlane_b32 s16, v0
; HAWAII-NEXT: v_readfirstlane_b32 s17, v1
@@ -748,25 +748,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; HAWAII-NEXT: s_cbranch_execnz .LBB18_1
; HAWAII-NEXT: ; %bb.2:
; HAWAII-NEXT: s_mov_b64 exec, s[54:55]
+; HAWAII-NEXT: v_readlane_b32 s30, v40, 16
; HAWAII-NEXT: v_mov_b32_e32 v0, v4
-; HAWAII-NEXT: v_readlane_b32 s65, v40, 17
-; HAWAII-NEXT: v_readlane_b32 s64, v40, 16
-; HAWAII-NEXT: v_readlane_b32 s55, v40, 15
-; HAWAII-NEXT: v_readlane_b32 s54, v40, 14
-; HAWAII-NEXT: v_readlane_b32 s53, v40, 13
-; HAWAII-NEXT: v_readlane_b32 s52, v40, 12
-; HAWAII-NEXT: v_readlane_b32 s51, v40, 11
-; HAWAII-NEXT: v_readlane_b32 s50, v40, 10
-; HAWAII-NEXT: v_readlane_b32 s49, v40, 9
-; HAWAII-NEXT: v_readlane_b32 s48, v40, 8
-; HAWAII-NEXT: v_readlane_b32 s39, v40, 7
-; HAWAII-NEXT: v_readlane_b32 s38, v40, 6
-; HAWAII-NEXT: v_readlane_b32 s37, v40, 5
-; HAWAII-NEXT: v_readlane_b32 s36, v40, 4
-; HAWAII-NEXT: v_readlane_b32 s35, v40, 3
-; HAWAII-NEXT: v_readlane_b32 s34, v40, 2
-; HAWAII-NEXT: v_readlane_b32 s31, v40, 1
-; HAWAII-NEXT: v_readlane_b32 s30, v40, 0
+; HAWAII-NEXT: v_readlane_b32 s31, v40, 17
+; HAWAII-NEXT: v_readlane_b32 s65, v40, 15
+; HAWAII-NEXT: v_readlane_b32 s64, v40, 14
+; HAWAII-NEXT: v_readlane_b32 s55, v40, 13
+; HAWAII-NEXT: v_readlane_b32 s54, v40, 12
+; HAWAII-NEXT: v_readlane_b32 s53, v40, 11
+; HAWAII-NEXT: v_readlane_b32 s52, v40, 10
+; HAWAII-NEXT: v_readlane_b32 s51, v40, 9
+; HAWAII-NEXT: v_readlane_b32 s50, v40, 8
+; HAWAII-NEXT: v_readlane_b32 s49, v40, 7
+; HAWAII-NEXT: v_readlane_b32 s48, v40, 6
+; HAWAII-NEXT: v_readlane_b32 s39, v40, 5
+; HAWAII-NEXT: v_readlane_b32 s38, v40, 4
+; HAWAII-NEXT: v_readlane_b32 s37, v40, 3
+; HAWAII-NEXT: v_readlane_b32 s36, v40, 2
+; HAWAII-NEXT: v_readlane_b32 s35, v40, 1
+; HAWAII-NEXT: v_readlane_b32 s34, v40, 0
; HAWAII-NEXT: s_mov_b32 s32, s33
; HAWAII-NEXT: v_readlane_b32 s4, v40, 18
; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -785,23 +785,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s16, 18
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s35, 3
-; GFX9-NEXT: v_writelane_b32 v40, s36, 4
-; GFX9-NEXT: v_writelane_b32 v40, s37, 5
-; GFX9-NEXT: v_writelane_b32 v40, s38, 6
-; GFX9-NEXT: v_writelane_b32 v40, s39, 7
-; GFX9-NEXT: v_writelane_b32 v40, s48, 8
-; GFX9-NEXT: v_writelane_b32 v40, s49, 9
-; GFX9-NEXT: v_writelane_b32 v40, s50, 10
-; GFX9-NEXT: v_writelane_b32 v40, s51, 11
-; GFX9-NEXT: v_writelane_b32 v40, s52, 12
-; GFX9-NEXT: v_writelane_b32 v40, s53, 13
-; GFX9-NEXT: v_writelane_b32 v40, s54, 14
-; GFX9-NEXT: v_writelane_b32 v40, s55, 15
-; GFX9-NEXT: v_writelane_b32 v40, s64, 16
+; GFX9-NEXT: v_writelane_b32 v40, s34, 0
+; GFX9-NEXT: v_writelane_b32 v40, s35, 1
+; GFX9-NEXT: v_writelane_b32 v40, s36, 2
+; GFX9-NEXT: v_writelane_b32 v40, s37, 3
+; GFX9-NEXT: v_writelane_b32 v40, s38, 4
+; GFX9-NEXT: v_writelane_b32 v40, s39, 5
+; GFX9-NEXT: v_writelane_b32 v40, s48, 6
+; GFX9-NEXT: v_writelane_b32 v40, s49, 7
+; GFX9-NEXT: v_writelane_b32 v40, s50, 8
+; GFX9-NEXT: v_writelane_b32 v40, s51, 9
+; GFX9-NEXT: v_writelane_b32 v40, s52, 10
+; GFX9-NEXT: v_writelane_b32 v40, s53, 11
+; GFX9-NEXT: v_writelane_b32 v40, s54, 12
+; GFX9-NEXT: v_writelane_b32 v40, s55, 13
+; GFX9-NEXT: v_writelane_b32 v40, s64, 14
+; GFX9-NEXT: v_writelane_b32 v40, s65, 15
+; GFX9-NEXT: v_writelane_b32 v40, s30, 16
; GFX9-NEXT: s_mov_b32 s50, s15
; GFX9-NEXT: s_mov_b32 s51, s14
; GFX9-NEXT: s_mov_b32 s52, s13
@@ -813,7 +813,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
; GFX9-NEXT: s_mov_b64 s[54:55], exec
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s65, 17
+; GFX9-NEXT: v_writelane_b32 v40, s31, 17
; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
; GFX9-NEXT: v_readfirstlane_b32 s17, v1
@@ -839,25 +839,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
; GFX9-NEXT: s_cbranch_execnz .LBB18_1
; GFX9-NEXT: ; %bb.2:
; GFX9-NEXT: s_mov_b64 exec, s[54:55]
+; GFX9-NEXT: v_readlane_b32 s30, v40, 16
; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_readlane_b32 s65, v40, 17
-; GFX9-NEXT: v_readlane_b32 s64, v40, 16
-; GFX9-NEXT: v_readlane_b32 s55, v40, 15
-; GFX9-NEXT: v_readlane_b32 s54, v40, 14
-; GFX9-NEXT: v_readlane_b32 s53, v40, 13
-; GFX9-NEXT: v_readlane_b32 s52, v40, 12
-; GFX9-NEXT: v_readlane_b32 s51, v40, 11
-; GFX9-NEXT: v_readlane_b32 s50, v40, 10
-; GFX9-NEXT: v_readlane_b32 s49, v40, 9
-; GFX9-NEXT: v_readlane_b32 s48, v40, 8
-; GFX9-NEXT: v_readlane_b32 s39, v40, 7
-; GFX9-NEXT: v_readlane_b32 s38, v40, 6
-; GFX9-NEXT: v_readlane_b32 s37, v40, 5
-; GFX9-NEXT: v_readlane_b32 s36, v40, 4
-; GFX9-NEXT: v_readlane_b32 s35, v40, 3
-; GFX9-NEXT: v_readlane_b32 s34, v40, 2
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 17
+; GFX9-NEXT: v_readlane_b32 s65, v40, 15
+; GFX9-NEXT: v_readlane_b32 s64, v40, 14
+; GFX9-NEXT: v_readlane_b32 s55, v40, 13
+; GFX9-NEXT: v_readlane_b32 s54, v40, 12
+; GFX9-NEXT: v_readlane_b32 s53, v40, 11
+; GFX9-NEXT: v_readlane_b32 s52, v40, 10
+; GFX9-NEXT: v_readlane_b32 s51, v40, 9
+; GFX9-NEXT: v_readlane_b32 s50, v40, 8
+; GFX9-NEXT: v_readlane_b32 s49, v40, 7
+; GFX9-NEXT: v_readlane_b32 s48, v40, 6
+; GFX9-NEXT: v_readlane_b32 s39, v40, 5
+; GFX9-NEXT: v_readlane_b32 s38, v40, 4
+; GFX9-NEXT: v_readlane_b32 s37, v40, 3
+; GFX9-NEXT: v_readlane_b32 s36, v40, 2
+; GFX9-NEXT: v_readlane_b32 s35, v40, 1
+; GFX9-NEXT: v_readlane_b32 s34, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 18
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 5ef54268c9372..540737672ed15 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -311,8 +311,8 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: v_readlane_b32 s34, v40, 3
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index 838ecf9afff2f..7112fd9e1af22 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -1283,11 +1283,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE32-OPT-NEXT: s_mov_b32 s32, s18
+; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0
; WAVE32-OPT-NEXT: ;;#ASMSTART
; WAVE32-OPT-NEXT: ; use s19
; WAVE32-OPT-NEXT: ;;#ASMEND
; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1
-; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0
; WAVE32-OPT-NEXT: s_mov_b32 s32, s33
; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1
; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1318,11 +1318,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE64-OPT-NEXT: s_mov_b32 s32, s18
+; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0
; WAVE64-OPT-NEXT: ;;#ASMSTART
; WAVE64-OPT-NEXT: ; use s19
; WAVE64-OPT-NEXT: ;;#ASMEND
; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1
-; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0
; WAVE64-OPT-NEXT: s_mov_b32 s32, s33
; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1
; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1431,8 +1431,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-O0-NEXT: ; use s5
; WAVE32-O0-NEXT: ;;#ASMEND
; WAVE32-O0-NEXT: s_mov_b32 s32, s4
-; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1
; WAVE32-O0-NEXT: v_readlane_b32 s30, v32, 0
+; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1
; WAVE32-O0-NEXT: s_mov_b32 s32, s33
; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1
; WAVE32-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1542,8 +1542,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE64-O0-NEXT: ; use s5
; WAVE64-O0-NEXT: ;;#ASMEND
; WAVE64-O0-NEXT: s_mov_b32 s32, s4
-; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1
; WAVE64-O0-NEXT: v_readlane_b32 s30, v32, 0
+; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1
; WAVE64-O0-NEXT: s_mov_b32 s32, s33
; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; WAVE64-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
@@ -1653,8 +1653,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-WWM-PREALLOC-NEXT: ; use s5
; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4
-; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1
; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0
+; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s33
; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1
; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
index 034119b98790f..05ea168c9ec7c 100644
--- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -175,8 +175,8 @@ define void @outgoing_f16_arg(ptr %ptr) #0 {
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
+; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -207,8 +207,8 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 {
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
+; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -245,8 +245,8 @@ define void @outgoing_f16_return(ptr %ptr) #0 {
; GFX7-NEXT: flat_store_short v[40:41], v0
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
+; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -283,8 +283,8 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 {
; GFX7-NEXT: flat_store_dword v[40:41], v0
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
+; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -324,8 +324,8 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
; GFX7-NEXT: flat_store_dword v[40:41], v0
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
+; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -371,8 +371,8 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 {
; GFX7-NEXT: flat_store_dword v[40:41], v0
; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
+; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -402,8 +402,8 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
+; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
index 5c6fcd4f977e3..13cde61ff16a0 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll
@@ -18,11 +18,12 @@ define void @test_load_zext() #0 {
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: s_mov_b32 s0, DescriptorBuffer@abs32@lo
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s0, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index e78d62561238b..e5215fe1acdef 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -646,29 +646,30 @@ define i32 @s_in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
; GCN-NEXT: s_or_saveexec_b32 s16, -1
; GCN-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b32 exec_lo, s16
-; GCN-NEXT: v_writelane_b32 v40, s2, 4
; GCN-NEXT: s_add_i32 s32, s32, 16
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, use32@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
-; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: v_writelane_b32 v40, s2, 4
; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
+; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
; GCN-NEXT: s_mov_b32 s34, s1
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
; GCN-NEXT: s_and_b32 s35, s0, s3
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s35
+; GCN-NEXT: v_writelane_b32 v40, s30, 2
+; GCN-NEXT: v_writelane_b32 v40, s31, 3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_xor_b32 s0, s35, s34
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_readlane_b32 s30, v40, 2
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 3
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s0, v40, 4
; GCN-NEXT: s_or_saveexec_b32 s1, -1
@@ -702,20 +703,21 @@ define i32 @s_in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg
; GCN-NEXT: s_xor_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
-; GCN-NEXT: v_writelane_b32 v40, s34, 2
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
; GCN-NEXT: s_mov_b32 s34, s1
-; GCN-NEXT: v_writelane_b32 v40, s35, 3
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
; GCN-NEXT: s_and_b32 s35, s0, s3
+; GCN-NEXT: v_writelane_b32 v40, s30, 2
+; GCN-NEXT: v_writelane_b32 v40, s31, 3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_xor_b32 s0, s35, s34
-; GCN-NEXT: v_readlane_b32 s35, v40, 3
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_readlane_b32 s30, v40, 2
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_readlane_b32 s34, v40, 2
-; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: v_readlane_b32 s31, v40, 3
+; GCN-NEXT: v_readlane_b32 s35, v40, 1
+; GCN-NEXT: v_readlane_b32 s34, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s0, v40, 4
; GCN-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 26d8a047e6541..d04c1b970187e 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -14,22 +14,22 @@ define hidden void @widget() {
; GCN-NEXT: v_writelane_b32 v41, s16, 16
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v41, s30, 0
-; GCN-NEXT: v_writelane_b32 v41, s31, 1
-; GCN-NEXT: v_writelane_b32 v41, s34, 2
-; GCN-NEXT: v_writelane_b32 v41, s35, 3
-; GCN-NEXT: v_writelane_b32 v41, s36, 4
-; GCN-NEXT: v_writelane_b32 v41, s37, 5
-; GCN-NEXT: v_writelane_b32 v41, s38, 6
-; GCN-NEXT: v_writelane_b32 v41, s39, 7
-; GCN-NEXT: v_writelane_b32 v41, s48, 8
-; GCN-NEXT: v_writelane_b32 v41, s49, 9
-; GCN-NEXT: v_writelane_b32 v41, s50, 10
-; GCN-NEXT: v_writelane_b32 v41, s51, 11
-; GCN-NEXT: v_writelane_b32 v41, s52, 12
-; GCN-NEXT: v_writelane_b32 v41, s53, 13
-; GCN-NEXT: v_writelane_b32 v41, s54, 14
-; GCN-NEXT: v_writelane_b32 v41, s55, 15
+; GCN-NEXT: v_writelane_b32 v41, s34, 0
+; GCN-NEXT: v_writelane_b32 v41, s35, 1
+; GCN-NEXT: v_writelane_b32 v41, s36, 2
+; GCN-NEXT: v_writelane_b32 v41, s37, 3
+; GCN-NEXT: v_writelane_b32 v41, s38, 4
+; GCN-NEXT: v_writelane_b32 v41, s39, 5
+; GCN-NEXT: v_writelane_b32 v41, s48, 6
+; GCN-NEXT: v_writelane_b32 v41, s49, 7
+; GCN-NEXT: v_writelane_b32 v41, s50, 8
+; GCN-NEXT: v_writelane_b32 v41, s51, 9
+; GCN-NEXT: v_writelane_b32 v41, s52, 10
+; GCN-NEXT: v_writelane_b32 v41, s53, 11
+; GCN-NEXT: v_writelane_b32 v41, s54, 12
+; GCN-NEXT: v_writelane_b32 v41, s55, 13
+; GCN-NEXT: v_writelane_b32 v41, s30, 14
+; GCN-NEXT: v_writelane_b32 v41, s31, 15
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: flat_load_dword v0, v[0:1]
@@ -93,22 +93,22 @@ define hidden void @widget() {
; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock
-; GCN-NEXT: v_readlane_b32 s55, v41, 15
-; GCN-NEXT: v_readlane_b32 s54, v41, 14
-; GCN-NEXT: v_readlane_b32 s53, v41, 13
-; GCN-NEXT: v_readlane_b32 s52, v41, 12
-; GCN-NEXT: v_readlane_b32 s51, v41, 11
-; GCN-NEXT: v_readlane_b32 s50, v41, 10
-; GCN-NEXT: v_readlane_b32 s49, v41, 9
-; GCN-NEXT: v_readlane_b32 s48, v41, 8
-; GCN-NEXT: v_readlane_b32 s39, v41, 7
-; GCN-NEXT: v_readlane_b32 s38, v41, 6
-; GCN-NEXT: v_readlane_b32 s37, v41, 5
-; GCN-NEXT: v_readlane_b32 s36, v41, 4
-; GCN-NEXT: v_readlane_b32 s35, v41, 3
-; GCN-NEXT: v_readlane_b32 s34, v41, 2
-; GCN-NEXT: v_readlane_b32 s31, v41, 1
-; GCN-NEXT: v_readlane_b32 s30, v41, 0
+; GCN-NEXT: v_readlane_b32 s30, v41, 14
+; GCN-NEXT: v_readlane_b32 s31, v41, 15
+; GCN-NEXT: v_readlane_b32 s55, v41, 13
+; GCN-NEXT: v_readlane_b32 s54, v41, 12
+; GCN-NEXT: v_readlane_b32 s53, v41, 11
+; GCN-NEXT: v_readlane_b32 s52, v41, 10
+; GCN-NEXT: v_readlane_b32 s51, v41, 9
+; GCN-NEXT: v_readlane_b32 s50, v41, 8
+; GCN-NEXT: v_readlane_b32 s49, v41, 7
+; GCN-NEXT: v_readlane_b32 s48, v41, 6
+; GCN-NEXT: v_readlane_b32 s39, v41, 5
+; GCN-NEXT: v_readlane_b32 s38, v41, 4
+; GCN-NEXT: v_readlane_b32 s37, v41, 3
+; GCN-NEXT: v_readlane_b32 s36, v41, 2
+; GCN-NEXT: v_readlane_b32 s35, v41, 1
+; GCN-NEXT: v_readlane_b32 s34, v41, 0
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v41, 16
@@ -266,32 +266,32 @@ define hidden void @blam() {
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v45, s30, 0
-; GCN-NEXT: v_writelane_b32 v45, s31, 1
-; GCN-NEXT: v_writelane_b32 v45, s34, 2
-; GCN-NEXT: v_writelane_b32 v45, s35, 3
-; GCN-NEXT: v_writelane_b32 v45, s36, 4
-; GCN-NEXT: v_writelane_b32 v45, s37, 5
-; GCN-NEXT: v_writelane_b32 v45, s38, 6
-; GCN-NEXT: v_writelane_b32 v45, s39, 7
-; GCN-NEXT: v_writelane_b32 v45, s48, 8
-; GCN-NEXT: v_writelane_b32 v45, s49, 9
-; GCN-NEXT: v_writelane_b32 v45, s50, 10
-; GCN-NEXT: v_writelane_b32 v45, s51, 11
-; GCN-NEXT: v_writelane_b32 v45, s52, 12
-; GCN-NEXT: v_writelane_b32 v45, s53, 13
-; GCN-NEXT: v_writelane_b32 v45, s54, 14
-; GCN-NEXT: v_writelane_b32 v45, s55, 15
-; GCN-NEXT: v_writelane_b32 v45, s64, 16
-; GCN-NEXT: v_writelane_b32 v45, s65, 17
-; GCN-NEXT: v_writelane_b32 v45, s66, 18
-; GCN-NEXT: v_writelane_b32 v45, s67, 19
-; GCN-NEXT: v_writelane_b32 v45, s68, 20
-; GCN-NEXT: v_writelane_b32 v45, s69, 21
-; GCN-NEXT: v_writelane_b32 v45, s70, 22
-; GCN-NEXT: v_writelane_b32 v45, s71, 23
-; GCN-NEXT: v_writelane_b32 v45, s80, 24
-; GCN-NEXT: v_writelane_b32 v45, s81, 25
+; GCN-NEXT: v_writelane_b32 v45, s34, 0
+; GCN-NEXT: v_writelane_b32 v45, s35, 1
+; GCN-NEXT: v_writelane_b32 v45, s36, 2
+; GCN-NEXT: v_writelane_b32 v45, s37, 3
+; GCN-NEXT: v_writelane_b32 v45, s38, 4
+; GCN-NEXT: v_writelane_b32 v45, s39, 5
+; GCN-NEXT: v_writelane_b32 v45, s48, 6
+; GCN-NEXT: v_writelane_b32 v45, s49, 7
+; GCN-NEXT: v_writelane_b32 v45, s50, 8
+; GCN-NEXT: v_writelane_b32 v45, s51, 9
+; GCN-NEXT: v_writelane_b32 v45, s52, 10
+; GCN-NEXT: v_writelane_b32 v45, s53, 11
+; GCN-NEXT: v_writelane_b32 v45, s54, 12
+; GCN-NEXT: v_writelane_b32 v45, s55, 13
+; GCN-NEXT: v_writelane_b32 v45, s64, 14
+; GCN-NEXT: v_writelane_b32 v45, s65, 15
+; GCN-NEXT: v_writelane_b32 v45, s66, 16
+; GCN-NEXT: v_writelane_b32 v45, s67, 17
+; GCN-NEXT: v_writelane_b32 v45, s68, 18
+; GCN-NEXT: v_writelane_b32 v45, s69, 19
+; GCN-NEXT: v_writelane_b32 v45, s70, 20
+; GCN-NEXT: v_writelane_b32 v45, s71, 21
+; GCN-NEXT: v_writelane_b32 v45, s80, 22
+; GCN-NEXT: v_writelane_b32 v45, s81, 23
+; GCN-NEXT: v_writelane_b32 v45, s30, 24
+; GCN-NEXT: v_writelane_b32 v45, s31, 25
; GCN-NEXT: v_mov_b32_e32 v40, v31
; GCN-NEXT: s_mov_b32 s54, s15
; GCN-NEXT: s_mov_b32 s55, s14
@@ -427,32 +427,32 @@ define hidden void @blam() {
; GCN-NEXT: s_branch .LBB1_1
; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock
; GCN-NEXT: s_or_b64 exec, exec, s[66:67]
-; GCN-NEXT: v_readlane_b32 s81, v45, 25
-; GCN-NEXT: v_readlane_b32 s80, v45, 24
-; GCN-NEXT: v_readlane_b32 s71, v45, 23
-; GCN-NEXT: v_readlane_b32 s70, v45, 22
-; GCN-NEXT: v_readlane_b32 s69, v45, 21
-; GCN-NEXT: v_readlane_b32 s68, v45, 20
-; GCN-NEXT: v_readlane_b32 s67, v45, 19
-; GCN-NEXT: v_readlane_b32 s66, v45, 18
-; GCN-NEXT: v_readlane_b32 s65, v45, 17
-; GCN-NEXT: v_readlane_b32 s64, v45, 16
-; GCN-NEXT: v_readlane_b32 s55, v45, 15
-; GCN-NEXT: v_readlane_b32 s54, v45, 14
-; GCN-NEXT: v_readlane_b32 s53, v45, 13
-; GCN-NEXT: v_readlane_b32 s52, v45, 12
-; GCN-NEXT: v_readlane_b32 s51, v45, 11
-; GCN-NEXT: v_readlane_b32 s50, v45, 10
-; GCN-NEXT: v_readlane_b32 s49, v45, 9
-; GCN-NEXT: v_readlane_b32 s48, v45, 8
-; GCN-NEXT: v_readlane_b32 s39, v45, 7
-; GCN-NEXT: v_readlane_b32 s38, v45, 6
-; GCN-NEXT: v_readlane_b32 s37, v45, 5
-; GCN-NEXT: v_readlane_b32 s36, v45, 4
-; GCN-NEXT: v_readlane_b32 s35, v45, 3
-; GCN-NEXT: v_readlane_b32 s34, v45, 2
-; GCN-NEXT: v_readlane_b32 s31, v45, 1
-; GCN-NEXT: v_readlane_b32 s30, v45, 0
+; GCN-NEXT: v_readlane_b32 s30, v45, 24
+; GCN-NEXT: v_readlane_b32 s31, v45, 25
+; GCN-NEXT: v_readlane_b32 s81, v45, 23
+; GCN-NEXT: v_readlane_b32 s80, v45, 22
+; GCN-NEXT: v_readlane_b32 s71, v45, 21
+; GCN-NEXT: v_readlane_b32 s70, v45, 20
+; GCN-NEXT: v_readlane_b32 s69, v45, 19
+; GCN-NEXT: v_readlane_b32 s68, v45, 18
+; GCN-NEXT: v_readlane_b32 s67, v45, 17
+; GCN-NEXT: v_readlane_b32 s66, v45, 16
+; GCN-NEXT: v_readlane_b32 s65, v45, 15
+; GCN-NEXT: v_readlane_b32 s64, v45, 14
+; GCN-NEXT: v_readlane_b32 s55, v45, 13
+; GCN-NEXT: v_readlane_b32 s54, v45, 12
+; GCN-NEXT: v_readlane_b32 s53, v45, 11
+; GCN-NEXT: v_readlane_b32 s52, v45, 10
+; GCN-NEXT: v_readlane_b32 s51, v45, 9
+; GCN-NEXT: v_readlane_b32 s50, v45, 8
+; GCN-NEXT: v_readlane_b32 s49, v45, 7
+; GCN-NEXT: v_readlane_b32 s48, v45, 6
+; GCN-NEXT: v_readlane_b32 s39, v45, 5
+; GCN-NEXT: v_readlane_b32 s38, v45, 4
+; GCN-NEXT: v_readlane_b32 s37, v45, 3
+; GCN-NEXT: v_readlane_b32 s36, v45, 2
+; GCN-NEXT: v_readlane_b32 s35, v45, 1
+; GCN-NEXT: v_readlane_b32 s34, v45, 0
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index ff1475758382f..580ef1522ee14 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -52,8 +52,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v44, 1
; GFX9-NEXT: v_readlane_b32 s30, v44, 0
+; GFX9-NEXT: v_readlane_b32 s31, v44, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v44, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -109,8 +109,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12
-; GFX10-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-NEXT: v_readlane_b32 s30, v44, 0
+; GFX10-NEXT: v_readlane_b32 s31, v44, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s4, v44, 2
; GFX10-NEXT: s_or_saveexec_b32 s5, -1
@@ -163,8 +163,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-NEXT: v_readlane_b32 s30, v44, 0
+; GFX11-NEXT: v_readlane_b32 s31, v44, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v44, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
@@ -236,8 +236,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s31, v45, 1
; GFX9-NEXT: v_readlane_b32 s30, v45, 0
+; GFX9-NEXT: v_readlane_b32 s31, v45, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v45, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -286,8 +286,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16
-; GFX10-NEXT: v_readlane_b32 s31, v45, 1
; GFX10-NEXT: v_readlane_b32 s30, v45, 0
+; GFX10-NEXT: v_readlane_b32 s31, v45, 1
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: v_readlane_b32 s4, v45, 2
; GFX10-NEXT: s_or_saveexec_b32 s5, -1
@@ -335,8 +335,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16
-; GFX11-NEXT: v_readlane_b32 s31, v45, 1
; GFX11-NEXT: v_readlane_b32 s30, v45, 0
+; GFX11-NEXT: v_readlane_b32 s31, v45, 1
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_readlane_b32 s0, v45, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
index 93d864246d68d..b685a79027ba2 100644
--- a/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
@@ -118,8 +118,8 @@ define i32 @fix_sgpr_copies_indirect_call(ptr addrspace(5) %ptr) {
; CHECK-NEXT: v_readlane_b32 s5, v41, 13
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 4
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4a00a09850a58..50220b3e8cd7e 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -3085,8 +3085,8 @@ define void @callee_no_stack_with_call() #1 {
; GFX1032-NEXT: v_writelane_b32 v40, s31, 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_readlane_b32 s31, v40, 1
; GFX1032-NEXT: v_readlane_b32 s30, v40, 0
+; GFX1032-NEXT: v_readlane_b32 s31, v40, 1
; GFX1032-NEXT: s_mov_b32 s32, s33
; GFX1032-NEXT: v_readlane_b32 s4, v40, 2
; GFX1032-NEXT: s_or_saveexec_b32 s5, -1
@@ -3116,8 +3116,8 @@ define void @callee_no_stack_with_call() #1 {
; GFX1064-NEXT: v_writelane_b32 v40, s31, 1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_readlane_b32 s31, v40, 1
; GFX1064-NEXT: v_readlane_b32 s30, v40, 0
+; GFX1064-NEXT: v_readlane_b32 s31, v40, 1
; GFX1064-NEXT: s_mov_b32 s32, s33
; GFX1064-NEXT: v_readlane_b32 s4, v40, 2
; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index bdef2449c5e0b..250d7beb47e23 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -1593,8 +1593,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -1929,8 +1929,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_readlane_b32 s31, v40, 2
; GISEL-NEXT: v_readlane_b32 s30, v40, 1
+; GISEL-NEXT: v_readlane_b32 s31, v40, 2
; GISEL-NEXT: v_readlane_b32 s4, v40, 0
; GISEL-NEXT: v_readlane_b32 s0, v40, 3
; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
@@ -2266,8 +2266,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; DAGISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3
; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3
; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1
; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0
; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4
@@ -2604,8 +2604,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL64-NEXT: v_readlane_b32 s31, v40, 3
; GISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT: v_readlane_b32 s31, v40, 3
; GISEL64-NEXT: v_readlane_b32 s5, v40, 1
; GISEL64-NEXT: v_readlane_b32 s4, v40, 0
; GISEL64-NEXT: v_readlane_b32 s0, v40, 4
@@ -3719,8 +3719,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s31, 2
; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
+; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s0, v40, 3
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40, off, s33 nv ; 4-byte Folded Reload
@@ -8048,9 +8048,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; DAGISEL-NEXT: v_writelane_b32 v42, s31, 2
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
; DAGISEL-NEXT: flat_store_b32 v[40:41], v0
; DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
-; DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
; DAGISEL-NEXT: v_readlane_b32 s4, v42, 0
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 3
; DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload
@@ -8389,9 +8390,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GISEL-NEXT: v_writelane_b32 v42, s31, 2
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s30, v42, 1
; GISEL-NEXT: flat_store_b32 v[40:41], v0
; GISEL-NEXT: v_readlane_b32 s31, v42, 2
-; GISEL-NEXT: v_readlane_b32 s30, v42, 1
; GISEL-NEXT: v_readlane_b32 s4, v42, 0
; GISEL-NEXT: v_readlane_b32 s0, v42, 3
; GISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload
@@ -8732,9 +8734,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; DAGISEL64-NEXT: v_writelane_b32 v42, s31, 3
; DAGISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_readlane_b32 s30, v42, 2
; DAGISEL64-NEXT: flat_store_b32 v[40:41], v0
; DAGISEL64-NEXT: v_readlane_b32 s31, v42, 3
-; DAGISEL64-NEXT: v_readlane_b32 s30, v42, 2
; DAGISEL64-NEXT: v_readlane_b32 s5, v42, 1
; DAGISEL64-NEXT: v_readlane_b32 s4, v42, 0
; DAGISEL64-NEXT: v_readlane_b32 s0, v42, 4
@@ -9076,9 +9079,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GISEL64-NEXT: v_writelane_b32 v42, s31, 3
; GISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_readlane_b32 s30, v42, 2
; GISEL64-NEXT: flat_store_b32 v[40:41], v0
; GISEL64-NEXT: v_readlane_b32 s31, v42, 3
-; GISEL64-NEXT: v_readlane_b32 s30, v42, 2
; GISEL64-NEXT: v_readlane_b32 s5, v42, 1
; GISEL64-NEXT: v_readlane_b32 s4, v42, 0
; GISEL64-NEXT: v_readlane_b32 s0, v42, 4
@@ -10197,9 +10201,10 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v42, s30, 1
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v42, s31, 2
; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
; GFX1250-DAGISEL-NEXT: flat_store_b32 v[40:41], v0
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
-; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s4, v42, 0
; GFX1250-DAGISEL-NEXT: v_readlane_b32 s0, v42, 3
; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index 06c451869e841..3fe54cd045c0f 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -41,12 +41,12 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1
; GFX90A-NEXT: v_accvgpr_read_b32 v39, a32
; GFX90A-NEXT: s_mov_b64 exec, s[28:29]
+; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-NEXT: v_readlane_b32 s20, v39, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s20
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
-; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-NEXT: s_mov_b32 s32, s33
; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
; GFX90A-NEXT: v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index ff33cca0702ae..5009f0249df6d 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -49,10 +49,10 @@ define void @test() #0 {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v39, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v40, 1
-; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_readlane_b32 s4, v40, 4
; GCN-NEXT: v_readlane_b32 s28, v40, 2
@@ -111,8 +111,8 @@ define void @test() #0 {
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: global_store_dword v[0:1], v2, off
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1
; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1
; GCN-O0-NEXT: s_mov_b32 s32, s33
; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4
; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e9a0671ead4e0..fe641367944be 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -387,8 +387,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O0-NEXT: s_mov_b32 s32, s33
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -424,9 +424,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O3-NEXT: s_mov_b32 s32, s33
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -622,8 +622,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: s_mov_b32 s34, 0
; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1
; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0
+; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1
; GFX9-O0-NEXT: s_mov_b32 s32, s33
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
@@ -683,9 +683,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0
; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0
; GFX9-O3-NEXT: s_mov_b32 s32, s33
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
More information about the llvm-branch-commits
mailing list