[llvm-branch-commits] [llvm] [AMDGPU] Add liverange split instructions into BB Prolog (PR #117544)

Christudasan Devadasan via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Nov 6 22:51:01 PST 2025


https://github.com/cdevadas updated https://github.com/llvm/llvm-project/pull/117544

>From 80232b5877a5b48e22881a45c84097299406b078 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Tue, 19 Nov 2024 12:14:05 +0530
Subject: [PATCH] [AMDGPU] Add liverange split instructions into BB Prolog

The COPY inserted for liverange split during sgpr-regalloc
pipeline currently breaks the BB prolog during the subsequent
vgpr-regalloc phase while spilling and/or splitting the vector
liveranges. This patch fixes it by correctly including the
LR split instructions during sgpr-regalloc and wwm-regalloc
pipelines into the BB prolog.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   34 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |    2 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 1216 +++++++++--------
 .../ran-out-of-sgprs-allocation-failure.mir   |  120 +-
 4 files changed, 707 insertions(+), 665 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..795b50e76dae7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9709,6 +9709,30 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
   return AMDGPU::COPY;
 }
 
+bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
+  uint16_t Opcode = MI.getOpcode();
+  // Check if it is SGPR spill or wwm-register spill Opcode.
+  if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
+    return true;
+
+  const MachineFunction *MF = MI.getMF();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+  // See if this is Liverange split instruction inserted for SGPR or
+  // wwm-register. The implicit def inserted for wwm-registers should also be
+  // included as they can appear at the bb begin.
+  bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
+  if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
+    return false;
+
+  Register Reg = MI.getOperand(0).getReg();
+  if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
+    return IsLRSplitInst;
+
+  return MFI->isWWMReg(Reg);
+}
+
 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
                                        Register Reg) const {
   // We need to handle instructions which may be inserted during register
@@ -9717,20 +9741,16 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
   // needed by the prolog. However, the insertions for scalar registers can
   // always be placed at the BB top as they are independent of the exec mask
   // value.
-  const MachineFunction *MF = MI.getParent()->getParent();
   bool IsNullOrVectorRegister = true;
   if (Reg) {
+    const MachineFunction *MF = MI.getMF();
     const MachineRegisterInfo &MRI = MF->getRegInfo();
     IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
   }
 
-  uint16_t Opcode = MI.getOpcode();
-  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   return IsNullOrVectorRegister &&
-         (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
-          (Opcode == AMDGPU::IMPLICIT_DEF &&
-           MFI->isWWMReg(MI.getOperand(0).getReg())) ||
-          (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
+         (canAddToBBProlog(MI) ||
+          (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
            MI.modifiesRegister(AMDGPU::EXEC, &RI)));
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0643b532ea04c..71ed966d90e63 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1561,6 +1561,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   bool isBasicBlockPrologue(const MachineInstr &MI,
                             Register Reg = Register()) const override;
 
+  bool canAddToBBProlog(const MachineInstr &MI) const;
+
   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator InsPt,
                                          const DebugLoc &DL, Register Src,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 08e64da632d3b..e6946a110dc12 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -181571,13 +181571,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:332
 ; SI-NEXT:    ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
-; SI-NEXT:    s_mov_b32 s10, s16
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_writelane_b32 v63, s30, 0
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_writelane_b32 v61, s29, 0
 ; SI-NEXT:    v_writelane_b32 v61, s28, 1
 ; SI-NEXT:    v_writelane_b32 v61, s27, 2
-; SI-NEXT:    s_mov_b32 s61, s21
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
+; SI-NEXT:    v_writelane_b32 v61, s26, 3
+; SI-NEXT:    v_writelane_b32 v61, s25, 4
+; SI-NEXT:    v_writelane_b32 v61, s24, 5
+; SI-NEXT:    v_writelane_b32 v61, s23, 6
+; SI-NEXT:    v_writelane_b32 v61, s22, 7
+; SI-NEXT:    v_writelane_b32 v61, s21, 8
 ; SI-NEXT:    v_writelane_b32 v63, s31, 1
 ; SI-NEXT:    v_writelane_b32 v63, s34, 2
 ; SI-NEXT:    v_writelane_b32 v63, s35, 3
@@ -181611,59 +181616,52 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_writelane_b32 v63, s87, 31
 ; SI-NEXT:    v_writelane_b32 v63, s96, 32
 ; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    s_mov_b32 s67, s19
-; SI-NEXT:    s_mov_b32 s54, s17
-; SI-NEXT:    s_mov_b32 s35, s23
-; SI-NEXT:    s_mov_b32 s39, s26
-; SI-NEXT:    s_mov_b32 s62, s25
+; SI-NEXT:    s_mov_b32 s72, s19
+; SI-NEXT:    s_mov_b32 s73, s17
+; SI-NEXT:    s_mov_b32 s60, s20
 ; SI-NEXT:    v_writelane_b32 v63, s98, 34
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
-; SI-NEXT:    v_readfirstlane_b32 s99, v1
-; SI-NEXT:    v_readfirstlane_b32 s74, v24
+; SI-NEXT:    v_readfirstlane_b32 s31, v1
+; SI-NEXT:    v_readfirstlane_b32 s12, v28
 ; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s6, v23
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s74, 0
-; SI-NEXT:    v_readfirstlane_b32 s12, v26
-; SI-NEXT:    v_writelane_b32 v62, s6, 1
-; SI-NEXT:    v_readfirstlane_b32 s14, v25
-; SI-NEXT:    v_writelane_b32 v62, s12, 2
-; SI-NEXT:    v_readfirstlane_b32 s46, v28
-; SI-NEXT:    v_writelane_b32 v62, s14, 3
-; SI-NEXT:    v_readfirstlane_b32 s56, v27
-; SI-NEXT:    v_writelane_b32 v62, s46, 4
-; SI-NEXT:    v_readfirstlane_b32 s57, v30
-; SI-NEXT:    v_writelane_b32 v62, s56, 5
-; SI-NEXT:    v_readfirstlane_b32 s59, v29
-; SI-NEXT:    v_writelane_b32 v62, s57, 6
-; SI-NEXT:    v_writelane_b32 v62, s59, 7
-; SI-NEXT:    s_mov_b32 s60, s20
-; SI-NEXT:    s_mov_b32 s63, s24
-; SI-NEXT:    v_readfirstlane_b32 s95, v3
-; SI-NEXT:    v_readfirstlane_b32 s31, v5
-; SI-NEXT:    v_readfirstlane_b32 s24, v9
-; SI-NEXT:    v_readfirstlane_b32 s38, v12
-; SI-NEXT:    v_readfirstlane_b32 s36, v11
-; SI-NEXT:    v_readfirstlane_b32 s8, v14
-; SI-NEXT:    v_readfirstlane_b32 s27, v13
-; SI-NEXT:    v_readfirstlane_b32 s9, v16
-; SI-NEXT:    v_readfirstlane_b32 s79, v15
-; SI-NEXT:    v_readfirstlane_b32 s13, v18
-; SI-NEXT:    v_readfirstlane_b32 s15, v17
-; SI-NEXT:    v_readfirstlane_b32 s42, v20
-; SI-NEXT:    v_readfirstlane_b32 s43, v19
-; SI-NEXT:    v_readfirstlane_b32 s44, v22
+; SI-NEXT:    v_readfirstlane_b32 s41, v27
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v62, s12, 0
+; SI-NEXT:    v_readfirstlane_b32 s46, v30
+; SI-NEXT:    v_writelane_b32 v62, s41, 1
+; SI-NEXT:    v_readfirstlane_b32 s56, v29
+; SI-NEXT:    v_writelane_b32 v62, s46, 2
+; SI-NEXT:    v_writelane_b32 v62, s56, 3
+; SI-NEXT:    s_mov_b32 s10, s16
+; SI-NEXT:    v_readfirstlane_b32 s36, v3
+; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_readfirstlane_b32 s99, v6
+; SI-NEXT:    v_readfirstlane_b32 s94, v5
+; SI-NEXT:    v_readfirstlane_b32 s38, v7
+; SI-NEXT:    v_readfirstlane_b32 s91, v10
+; SI-NEXT:    v_readfirstlane_b32 s88, v9
+; SI-NEXT:    v_readfirstlane_b32 s90, v12
+; SI-NEXT:    v_readfirstlane_b32 s16, v11
+; SI-NEXT:    v_readfirstlane_b32 s24, v14
+; SI-NEXT:    v_readfirstlane_b32 s8, v13
+; SI-NEXT:    v_readfirstlane_b32 s27, v16
+; SI-NEXT:    v_readfirstlane_b32 s9, v15
+; SI-NEXT:    v_readfirstlane_b32 s79, v18
+; SI-NEXT:    v_readfirstlane_b32 s13, v17
+; SI-NEXT:    v_readfirstlane_b32 s40, v20
+; SI-NEXT:    v_readfirstlane_b32 s42, v19
+; SI-NEXT:    v_readfirstlane_b32 s43, v22
+; SI-NEXT:    v_readfirstlane_b32 s44, v21
+; SI-NEXT:    v_readfirstlane_b32 s78, v24
+; SI-NEXT:    v_readfirstlane_b32 s37, v23
+; SI-NEXT:    v_readfirstlane_b32 s28, v26
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:328
-; SI-NEXT:    v_writelane_b32 v61, s4, 3
-; SI-NEXT:    v_readfirstlane_b32 s45, v21
-; SI-NEXT:    v_readfirstlane_b32 s98, v10
-; SI-NEXT:    v_readfirstlane_b32 s90, v8
-; SI-NEXT:    v_readfirstlane_b32 s88, v7
-; SI-NEXT:    v_readfirstlane_b32 s91, v6
-; SI-NEXT:    v_readfirstlane_b32 s93, v4
-; SI-NEXT:    v_readfirstlane_b32 s55, v2
+; SI-NEXT:    v_writelane_b32 v61, s4, 9
+; SI-NEXT:    v_readfirstlane_b32 s7, v25
+; SI-NEXT:    v_readfirstlane_b32 s95, v8
+; SI-NEXT:    v_readfirstlane_b32 s96, v4
+; SI-NEXT:    v_readfirstlane_b32 s6, v2
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
@@ -181681,375 +181679,375 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:324
-; SI-NEXT:    v_writelane_b32 v61, s4, 4
+; SI-NEXT:    v_writelane_b32 v61, s4, 10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:320
-; SI-NEXT:    v_writelane_b32 v61, s4, 5
+; SI-NEXT:    v_writelane_b32 v61, s4, 11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:316
-; SI-NEXT:    v_writelane_b32 v61, s4, 6
+; SI-NEXT:    v_writelane_b32 v61, s4, 12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:312
-; SI-NEXT:    v_writelane_b32 v61, s4, 7
+; SI-NEXT:    v_writelane_b32 v61, s4, 13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:308
-; SI-NEXT:    v_writelane_b32 v61, s4, 8
+; SI-NEXT:    v_writelane_b32 v61, s4, 14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:304
-; SI-NEXT:    v_writelane_b32 v61, s4, 9
+; SI-NEXT:    v_writelane_b32 v61, s4, 15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:300
-; SI-NEXT:    v_writelane_b32 v61, s4, 10
+; SI-NEXT:    v_writelane_b32 v61, s4, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:296
-; SI-NEXT:    v_writelane_b32 v61, s4, 11
+; SI-NEXT:    v_writelane_b32 v61, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:292
-; SI-NEXT:    v_writelane_b32 v61, s4, 12
+; SI-NEXT:    v_writelane_b32 v61, s4, 18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:288
-; SI-NEXT:    v_writelane_b32 v61, s4, 13
+; SI-NEXT:    v_writelane_b32 v61, s4, 19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:284
-; SI-NEXT:    v_writelane_b32 v61, s4, 14
+; SI-NEXT:    v_writelane_b32 v61, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:280
-; SI-NEXT:    v_writelane_b32 v61, s4, 15
+; SI-NEXT:    v_writelane_b32 v61, s4, 21
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:276
-; SI-NEXT:    v_writelane_b32 v61, s4, 16
+; SI-NEXT:    v_writelane_b32 v61, s4, 22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:272
-; SI-NEXT:    v_writelane_b32 v61, s4, 17
+; SI-NEXT:    v_writelane_b32 v61, s4, 23
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:268
-; SI-NEXT:    v_writelane_b32 v61, s4, 18
+; SI-NEXT:    v_writelane_b32 v61, s4, 24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:264
-; SI-NEXT:    v_writelane_b32 v61, s4, 19
+; SI-NEXT:    v_writelane_b32 v61, s4, 25
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:260
-; SI-NEXT:    v_writelane_b32 v61, s4, 20
+; SI-NEXT:    v_writelane_b32 v61, s4, 26
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:256
-; SI-NEXT:    v_writelane_b32 v61, s4, 21
+; SI-NEXT:    v_writelane_b32 v61, s4, 27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:252
-; SI-NEXT:    v_writelane_b32 v61, s4, 22
+; SI-NEXT:    v_writelane_b32 v61, s4, 28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:248
-; SI-NEXT:    v_writelane_b32 v61, s4, 23
+; SI-NEXT:    v_writelane_b32 v61, s4, 29
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:244
-; SI-NEXT:    v_writelane_b32 v61, s4, 24
+; SI-NEXT:    v_writelane_b32 v61, s4, 30
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:240
-; SI-NEXT:    v_writelane_b32 v61, s4, 25
+; SI-NEXT:    v_writelane_b32 v61, s4, 31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:236
-; SI-NEXT:    v_writelane_b32 v61, s4, 26
+; SI-NEXT:    v_writelane_b32 v61, s4, 32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:232
-; SI-NEXT:    v_writelane_b32 v61, s4, 27
+; SI-NEXT:    v_writelane_b32 v61, s4, 33
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:228
-; SI-NEXT:    v_writelane_b32 v61, s4, 28
+; SI-NEXT:    v_writelane_b32 v61, s4, 34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:224
-; SI-NEXT:    v_writelane_b32 v61, s4, 29
+; SI-NEXT:    v_writelane_b32 v61, s4, 35
+; SI-NEXT:    v_writelane_b32 v61, s73, 36
+; SI-NEXT:    v_writelane_b32 v61, s10, 37
+; SI-NEXT:    v_writelane_b32 v61, s72, 38
+; SI-NEXT:    v_writelane_b32 v61, s18, 39
+; SI-NEXT:    v_writelane_b32 v61, s60, 40
+; SI-NEXT:    v_writelane_b32 v61, s31, 41
+; SI-NEXT:    v_writelane_b32 v61, s36, 42
+; SI-NEXT:    v_writelane_b32 v61, s99, 43
+; SI-NEXT:    v_writelane_b32 v61, s94, 44
+; SI-NEXT:    v_writelane_b32 v61, s38, 45
+; SI-NEXT:    v_writelane_b32 v61, s91, 46
+; SI-NEXT:    v_writelane_b32 v61, s88, 47
+; SI-NEXT:    v_writelane_b32 v61, s90, 48
+; SI-NEXT:    v_writelane_b32 v61, s16, 49
+; SI-NEXT:    v_writelane_b32 v61, s24, 50
+; SI-NEXT:    v_writelane_b32 v61, s8, 51
+; SI-NEXT:    v_writelane_b32 v61, s27, 52
+; SI-NEXT:    v_writelane_b32 v61, s9, 53
+; SI-NEXT:    v_writelane_b32 v61, s79, 54
+; SI-NEXT:    v_writelane_b32 v61, s13, 55
+; SI-NEXT:    v_writelane_b32 v61, s40, 56
+; SI-NEXT:    v_writelane_b32 v61, s42, 57
+; SI-NEXT:    v_writelane_b32 v61, s43, 58
+; SI-NEXT:    v_writelane_b32 v61, s44, 59
+; SI-NEXT:    v_writelane_b32 v61, s78, 60
+; SI-NEXT:    v_writelane_b32 v61, s37, 61
+; SI-NEXT:    v_writelane_b32 v61, s28, 62
+; SI-NEXT:    v_writelane_b32 v61, s7, 63
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s93, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_writelane_b32 v61, s4, 30
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s68, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:216
-; SI-NEXT:    v_writelane_b32 v61, s4, 31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s89, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:212
-; SI-NEXT:    v_writelane_b32 v61, s4, 32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s16, v31
+; SI-NEXT:    v_readfirstlane_b32 s30, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:208
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s34, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:204
-; SI-NEXT:    v_writelane_b32 v61, s4, 33
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s89, v31
+; SI-NEXT:    v_readfirstlane_b32 s39, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:200
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s53, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:196
-; SI-NEXT:    v_writelane_b32 v61, s4, 34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s73, v31
+; SI-NEXT:    v_readfirstlane_b32 s45, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:192
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s81, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:188
-; SI-NEXT:    v_writelane_b32 v61, s4, 35
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s72, v31
+; SI-NEXT:    v_readfirstlane_b32 s66, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:184
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s40, v31
+; SI-NEXT:    v_readfirstlane_b32 s21, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:180
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s21, v31
+; SI-NEXT:    v_readfirstlane_b32 s69, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:176
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s85, v31
+; SI-NEXT:    v_readfirstlane_b32 s97, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:172
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s81, v31
+; SI-NEXT:    v_readfirstlane_b32 s25, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:168
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s97, v31
+; SI-NEXT:    v_readfirstlane_b32 s85, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:164
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s7, v31
+; SI-NEXT:    v_readfirstlane_b32 s29, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:160
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s11, v31
+; SI-NEXT:    v_readfirstlane_b32 s14, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:156
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s41, v31
+; SI-NEXT:    v_readfirstlane_b32 s11, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:152
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s47, v31
+; SI-NEXT:    v_readfirstlane_b32 s57, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s58, v31
+; SI-NEXT:    v_readfirstlane_b32 s47, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:144
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s76, v31
+; SI-NEXT:    v_readfirstlane_b32 s75, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:140
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s29, v31
+; SI-NEXT:    v_readfirstlane_b32 s59, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:136
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s50, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132
-; SI-NEXT:    v_writelane_b32 v61, s4, 36
-; SI-NEXT:    v_writelane_b32 v61, s54, 37
-; SI-NEXT:    v_writelane_b32 v61, s10, 38
-; SI-NEXT:    v_writelane_b32 v61, s67, 39
-; SI-NEXT:    v_writelane_b32 v61, s18, 40
-; SI-NEXT:    v_writelane_b32 v61, s61, 41
-; SI-NEXT:    v_writelane_b32 v61, s60, 42
-; SI-NEXT:    v_writelane_b32 v61, s35, 43
-; SI-NEXT:    v_writelane_b32 v61, s22, 44
-; SI-NEXT:    v_writelane_b32 v61, s62, 45
-; SI-NEXT:    v_writelane_b32 v61, s63, 46
-; SI-NEXT:    v_writelane_b32 v61, s39, 47
-; SI-NEXT:    v_writelane_b32 v61, s99, 48
-; SI-NEXT:    v_writelane_b32 v61, s95, 49
-; SI-NEXT:    v_writelane_b32 v61, s31, 50
-; SI-NEXT:    v_writelane_b32 v61, s24, 51
-; SI-NEXT:    v_writelane_b32 v61, s38, 52
-; SI-NEXT:    v_writelane_b32 v61, s36, 53
-; SI-NEXT:    v_writelane_b32 v61, s8, 54
-; SI-NEXT:    v_writelane_b32 v61, s27, 55
-; SI-NEXT:    v_writelane_b32 v61, s9, 56
-; SI-NEXT:    v_writelane_b32 v61, s79, 57
-; SI-NEXT:    v_writelane_b32 v61, s13, 58
-; SI-NEXT:    v_writelane_b32 v61, s15, 59
-; SI-NEXT:    v_writelane_b32 v61, s42, 60
-; SI-NEXT:    v_writelane_b32 v61, s43, 61
-; SI-NEXT:    v_writelane_b32 v61, s44, 62
-; SI-NEXT:    v_writelane_b32 v61, s45, 63
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s37, v31
+; SI-NEXT:    v_readfirstlane_b32 s76, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s50, v31
+; SI-NEXT:    v_readfirstlane_b32 s54, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s48, v31
+; SI-NEXT:    v_readfirstlane_b32 s35, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s19, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s64, v31
+; SI-NEXT:    v_readfirstlane_b32 s48, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s17, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s65, v31
+; SI-NEXT:    v_readfirstlane_b32 s52, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s71, v31
+; SI-NEXT:    v_readfirstlane_b32 s49, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s70, v31
+; SI-NEXT:    v_readfirstlane_b32 s65, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s83, v31
+; SI-NEXT:    v_readfirstlane_b32 s67, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s49, v31
+; SI-NEXT:    v_readfirstlane_b32 s64, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s80, v31
+; SI-NEXT:    v_readfirstlane_b32 s70, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s82, v31
+; SI-NEXT:    v_readfirstlane_b32 s71, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s87, v31
+; SI-NEXT:    v_readfirstlane_b32 s83, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s84, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s51, v31
+; SI-NEXT:    v_readfirstlane_b32 s55, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s86, v31
+; SI-NEXT:    v_readfirstlane_b32 s80, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s94, v31
+; SI-NEXT:    v_readfirstlane_b32 s62, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s96, v31
+; SI-NEXT:    v_readfirstlane_b32 s82, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s68, v31
+; SI-NEXT:    v_readfirstlane_b32 s63, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s34, v31
+; SI-NEXT:    v_readfirstlane_b32 s87, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s77, v31
+; SI-NEXT:    v_readfirstlane_b32 s61, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s66, v31
+; SI-NEXT:    v_readfirstlane_b32 s86, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s78, v31
+; SI-NEXT:    v_readfirstlane_b32 s22, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s53, v31
+; SI-NEXT:    v_readfirstlane_b32 s51, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s69, v31
+; SI-NEXT:    v_readfirstlane_b32 s74, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s30, v31
+; SI-NEXT:    v_readfirstlane_b32 s20, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s52, v31
+; SI-NEXT:    v_readfirstlane_b32 s26, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s75, v31
+; SI-NEXT:    v_readfirstlane_b32 s92, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s23, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s28, v31
+; SI-NEXT:    v_readfirstlane_b32 s98, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s26, v31
+; SI-NEXT:    v_readfirstlane_b32 s15, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s25, v31
+; SI-NEXT:    v_readfirstlane_b32 s58, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v62, s25, 8
-; SI-NEXT:    v_writelane_b32 v62, s28, 9
+; SI-NEXT:    v_writelane_b32 v62, s58, 4
+; SI-NEXT:    v_writelane_b32 v62, s98, 5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s92, v31
-; SI-NEXT:    v_writelane_b32 v62, s92, 10
-; SI-NEXT:    v_writelane_b32 v62, s75, 11
-; SI-NEXT:    v_writelane_b32 v62, s26, 12
-; SI-NEXT:    v_writelane_b32 v62, s30, 13
-; SI-NEXT:    v_writelane_b32 v62, s23, 14
-; SI-NEXT:    v_writelane_b32 v62, s52, 15
+; SI-NEXT:    v_readfirstlane_b32 s77, v31
+; SI-NEXT:    v_writelane_b32 v62, s77, 6
+; SI-NEXT:    v_writelane_b32 v62, s92, 7
+; SI-NEXT:    v_writelane_b32 v62, s15, 8
+; SI-NEXT:    v_writelane_b32 v62, s20, 9
+; SI-NEXT:    v_writelane_b32 v62, s23, 10
+; SI-NEXT:    v_writelane_b32 v62, s26, 11
+; SI-NEXT:    v_writelane_b32 v62, s48, 12
+; SI-NEXT:    v_writelane_b32 v62, s17, 13
+; SI-NEXT:    v_writelane_b32 v62, s52, 14
+; SI-NEXT:    v_writelane_b32 v62, s65, 15
 ; SI-NEXT:    v_writelane_b32 v62, s64, 16
-; SI-NEXT:    v_writelane_b32 v62, s17, 17
-; SI-NEXT:    v_writelane_b32 v62, s65, 18
-; SI-NEXT:    v_writelane_b32 v62, s70, 19
-; SI-NEXT:    v_writelane_b32 v62, s71, 20
-; SI-NEXT:    v_writelane_b32 v62, s49, 21
-; SI-NEXT:    v_writelane_b32 v62, s83, 22
-; SI-NEXT:    v_writelane_b32 v62, s80, 23
+; SI-NEXT:    v_writelane_b32 v62, s49, 17
+; SI-NEXT:    v_writelane_b32 v62, s67, 18
+; SI-NEXT:    v_writelane_b32 v62, s71, 19
+; SI-NEXT:    v_writelane_b32 v62, s70, 20
+; SI-NEXT:    v_writelane_b32 v62, s84, 21
+; SI-NEXT:    v_writelane_b32 v62, s80, 22
+; SI-NEXT:    v_writelane_b32 v62, s83, 23
 ; SI-NEXT:    v_writelane_b32 v62, s82, 24
-; SI-NEXT:    v_writelane_b32 v62, s84, 25
-; SI-NEXT:    v_writelane_b32 v62, s87, 26
+; SI-NEXT:    v_writelane_b32 v62, s87, 25
+; SI-NEXT:    v_writelane_b32 v62, s51, 26
 ; SI-NEXT:    v_writelane_b32 v62, s86, 27
-; SI-NEXT:    v_writelane_b32 v62, s51, 28
-; SI-NEXT:    v_writelane_b32 v62, s96, 29
-; SI-NEXT:    v_writelane_b32 v62, s34, 30
-; SI-NEXT:    v_writelane_b32 v62, s94, 31
-; SI-NEXT:    v_writelane_b32 v62, s53, 32
-; SI-NEXT:    v_writelane_b32 v62, s66, 33
-; SI-NEXT:    v_writelane_b32 v62, s68, 34
-; SI-NEXT:    v_writelane_b32 v62, s69, 35
-; SI-NEXT:    v_writelane_b32 v62, s77, 36
-; SI-NEXT:    v_writelane_b32 v62, s78, 37
+; SI-NEXT:    v_writelane_b32 v62, s55, 28
+; SI-NEXT:    v_writelane_b32 v62, s62, 29
+; SI-NEXT:    v_writelane_b32 v62, s63, 30
+; SI-NEXT:    v_writelane_b32 v62, s74, 31
+; SI-NEXT:    v_writelane_b32 v62, s61, 32
+; SI-NEXT:    v_writelane_b32 v62, s22, 33
 ; SI-NEXT:    s_cbranch_scc0 .LBB93_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s54, 8
+; SI-NEXT:    s_lshl_b32 s5, s73, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
 ; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s67, 8
+; SI-NEXT:    s_lshl_b32 s5, s72, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    v_readlane_b32 s5, v61, 8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
 ; SI-NEXT:    s_and_b32 s4, s60, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s61, 8
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s35, 8
+; SI-NEXT:    v_readlane_b32 s4, v61, 7
+; SI-NEXT:    v_readlane_b32 s5, v61, 6
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT:    s_and_b32 s4, s63, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s62, 8
+; SI-NEXT:    v_readlane_b32 s4, v61, 5
+; SI-NEXT:    v_readlane_b32 s5, v61, 4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 3
 ; SI-NEXT:    v_readlane_b32 s5, v61, 2
-; SI-NEXT:    s_and_b32 s4, s39, 0xff
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
 ; SI-NEXT:    v_readlane_b32 s4, v61, 1
@@ -182058,466 +182056,490 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s99, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s55, 8
+; SI-NEXT:    s_and_b32 s4, s31, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s6, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s95, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s93, 8
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, s4
+; SI-NEXT:    s_and_b32 s4, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s96, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s31, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s91, 8
+; SI-NEXT:    s_and_b32 s4, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s99, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s88, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s90, 8
+; SI-NEXT:    s_and_b32 s4, s38, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s95, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s98, 8
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, s4
+; SI-NEXT:    s_and_b32 s4, s88, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s91, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s36, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s38, 8
+; SI-NEXT:    s_and_b32 s4, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s90, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s27, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
+; SI-NEXT:    s_and_b32 s4, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s24, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, s4
-; SI-NEXT:    s_and_b32 s4, s79, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s9, 8
+; SI-NEXT:    s_and_b32 s4, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s27, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s13, 8
+; SI-NEXT:    s_and_b32 s4, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s79, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s42, 8
+; SI-NEXT:    s_and_b32 s4, s42, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s40, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, s4
-; SI-NEXT:    s_and_b32 s4, s45, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s44, 8
+; SI-NEXT:    s_and_b32 s4, s44, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s43, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, s4
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s74, 8
+; SI-NEXT:    s_and_b32 s4, s37, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s78, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, s4
-; SI-NEXT:    s_and_b32 s4, s14, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s12, 8
+; SI-NEXT:    s_and_b32 s4, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s28, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, s4
-; SI-NEXT:    s_and_b32 s4, s56, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s46, 8
+; SI-NEXT:    s_and_b32 s4, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s12, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, s4
-; SI-NEXT:    s_and_b32 s4, s59, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s57, 8
+; SI-NEXT:    s_and_b32 s4, s56, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s46, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v23, s4
-; SI-NEXT:    s_and_b32 s4, s92, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s25, 8
+; SI-NEXT:    s_and_b32 s4, s77, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s58, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s28, 8
+; SI-NEXT:    s_and_b32 s4, s15, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s98, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, s4
 ; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s75, 8
+; SI-NEXT:    s_lshl_b32 s5, s92, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, s4
-; SI-NEXT:    s_and_b32 s4, s52, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s30, 8
+; SI-NEXT:    s_and_b32 s4, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s20, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, s4
-; SI-NEXT:    s_and_b32 s4, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s53, 8
+; SI-NEXT:    s_and_b32 s4, s74, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s51, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, s4
-; SI-NEXT:    s_and_b32 s4, s78, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s66, 8
+; SI-NEXT:    s_and_b32 s4, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s86, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, s4
-; SI-NEXT:    s_and_b32 s4, s77, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s34, 8
+; SI-NEXT:    s_and_b32 s4, s61, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s87, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, s4
-; SI-NEXT:    s_and_b32 s4, s68, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s96, 8
+; SI-NEXT:    s_and_b32 s4, s63, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s82, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, s4
-; SI-NEXT:    s_and_b32 s4, s94, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s86, 8
+; SI-NEXT:    s_and_b32 s4, s62, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s80, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, s4
-; SI-NEXT:    s_and_b32 s4, s51, 0xff
+; SI-NEXT:    s_and_b32 s4, s55, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s84, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s4
-; SI-NEXT:    s_and_b32 s4, s87, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s82, 8
+; SI-NEXT:    s_and_b32 s4, s83, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s71, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, s4
-; SI-NEXT:    s_and_b32 s4, s80, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s49, 8
+; SI-NEXT:    s_and_b32 s4, s70, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s64, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, s4
-; SI-NEXT:    s_and_b32 s4, s83, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s70, 8
+; SI-NEXT:    s_and_b32 s4, s67, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s65, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, s4
-; SI-NEXT:    s_and_b32 s4, s71, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s65, 8
+; SI-NEXT:    s_and_b32 s4, s49, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s52, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, s4
 ; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s64, 8
+; SI-NEXT:    s_lshl_b32 s5, s48, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, s4
 ; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s48, 8
+; SI-NEXT:    s_lshl_b32 s5, s35, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, s4
-; SI-NEXT:    s_and_b32 s4, s50, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s37, 8
+; SI-NEXT:    s_and_b32 s4, s54, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s76, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s8, v61, 36
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, s4
-; SI-NEXT:    s_and_b32 s4, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s29, 8
+; SI-NEXT:    s_and_b32 s4, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s59, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s4
-; SI-NEXT:    s_and_b32 s4, s76, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s58, 8
+; SI-NEXT:    s_and_b32 s4, s75, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s47, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, s4
-; SI-NEXT:    s_and_b32 s4, s47, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s41, 8
+; SI-NEXT:    s_and_b32 s4, s57, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s11, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s7, 8
+; SI-NEXT:    s_and_b32 s4, s14, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s29, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, s4
-; SI-NEXT:    s_and_b32 s4, s97, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s81, 8
+; SI-NEXT:    s_and_b32 s4, s85, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, s4
-; SI-NEXT:    s_and_b32 s4, s85, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_and_b32 s4, s97, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s69, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, s4
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s72, 8
+; SI-NEXT:    s_and_b32 s4, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s66, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s69, v61, 35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, s4
-; SI-NEXT:    s_and_b32 s4, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s73, 8
+; SI-NEXT:    s_and_b32 s4, s81, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s45, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s68, v61, 34
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, s4
-; SI-NEXT:    s_and_b32 s4, s68, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s89, 8
+; SI-NEXT:    s_and_b32 s4, s53, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s39, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s66, v61, 33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s4
-; SI-NEXT:    s_and_b32 s4, s66, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s16, 8
+; SI-NEXT:    s_and_b32 s4, s34, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s30, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s53, v61, 32
-; SI-NEXT:    v_readlane_b32 s94, v61, 31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, s4
-; SI-NEXT:    s_and_b32 s4, s53, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s94, 8
+; SI-NEXT:    s_and_b32 s4, s89, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s68, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s34, v61, 30
-; SI-NEXT:    v_readlane_b32 s96, v61, 29
+; SI-NEXT:    v_readlane_b32 s5, v61, 35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, s4
-; SI-NEXT:    s_and_b32 s4, s34, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s96, 8
+; SI-NEXT:    s_and_b32 s4, s93, 0xff
+; SI-NEXT:    s_mov_b32 s99, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s51, v61, 28
-; SI-NEXT:    v_readlane_b32 s86, v61, 27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, s4
-; SI-NEXT:    s_and_b32 s4, s51, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s86, 8
+; SI-NEXT:    v_readlane_b32 s4, v61, 34
+; SI-NEXT:    v_readlane_b32 s5, v61, 33
+; SI-NEXT:    s_mov_b32 s31, s6
+; SI-NEXT:    s_mov_b32 s6, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s55, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 32
+; SI-NEXT:    v_readlane_b32 s5, v61, 31
+; SI-NEXT:    s_mov_b32 s86, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s51, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s87, v61, 26
-; SI-NEXT:    v_readlane_b32 s84, v61, 25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, s4
-; SI-NEXT:    s_and_b32 s4, s87, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s84, 8
+; SI-NEXT:    v_readlane_b32 s4, v61, 30
+; SI-NEXT:    v_readlane_b32 s5, v61, 29
+; SI-NEXT:    s_mov_b32 s36, s96
+; SI-NEXT:    s_mov_b32 s96, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s82, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s82, v61, 24
-; SI-NEXT:    v_readlane_b32 s80, v61, 23
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, s4
-; SI-NEXT:    s_and_b32 s4, s82, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s80, 8
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 28
+; SI-NEXT:    v_readlane_b32 s5, v61, 27
+; SI-NEXT:    s_mov_b32 s83, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s87, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s83, v61, 22
-; SI-NEXT:    v_readlane_b32 s49, v61, 21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, s4
-; SI-NEXT:    s_and_b32 s4, s83, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s49, 8
+; SI-NEXT:    v_readlane_b32 s4, v61, 26
+; SI-NEXT:    v_readlane_b32 s5, v61, 25
+; SI-NEXT:    s_mov_b32 s84, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s80, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s71, v61, 20
-; SI-NEXT:    v_readlane_b32 s70, v61, 19
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, s4
-; SI-NEXT:    s_and_b32 s4, s71, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s70, 8
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 24
+; SI-NEXT:    v_readlane_b32 s5, v61, 23
+; SI-NEXT:    s_mov_b32 s71, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s70, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s65, v61, 18
-; SI-NEXT:    v_readlane_b32 s54, v61, 17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, s4
-; SI-NEXT:    s_and_b32 s4, s65, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s54, 8
-; SI-NEXT:    s_mov_b32 s17, s19
-; SI-NEXT:    s_mov_b32 s19, s50
+; SI-NEXT:    v_readlane_b32 s4, v61, 22
+; SI-NEXT:    v_readlane_b32 s5, v61, 21
+; SI-NEXT:    s_mov_b32 s49, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s67, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s67, v61, 16
-; SI-NEXT:    v_readlane_b32 s50, v61, 15
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, s4
-; SI-NEXT:    s_and_b32 s4, s67, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s50, 8
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 20
+; SI-NEXT:    v_readlane_b32 s5, v61, 19
+; SI-NEXT:    s_mov_b32 s65, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s64, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s64, v61, 14
-; SI-NEXT:    v_readlane_b32 s52, v61, 13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, s4
-; SI-NEXT:    s_and_b32 s4, s64, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s52, 8
-; SI-NEXT:    s_mov_b32 s23, s48
+; SI-NEXT:    v_readlane_b32 s4, v61, 18
+; SI-NEXT:    v_readlane_b32 s5, v61, 17
+; SI-NEXT:    s_mov_b32 s17, s19
+; SI-NEXT:    s_mov_b32 s19, s54
+; SI-NEXT:    s_mov_b32 s26, s50
+; SI-NEXT:    s_mov_b32 s54, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s50, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s35, v61, 12
-; SI-NEXT:    v_readlane_b32 s48, v61, 11
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, s4
-; SI-NEXT:    s_and_b32 s4, s35, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s48, 8
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 16
+; SI-NEXT:    v_readlane_b32 s5, v61, 15
+; SI-NEXT:    s_mov_b32 s23, s35
+; SI-NEXT:    s_mov_b32 s35, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s48, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s30, v61, 10
-; SI-NEXT:    v_readlane_b32 s39, v61, 9
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT:    s_and_b32 s4, s30, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s39, 8
-; SI-NEXT:    s_mov_b32 s26, s37
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s37, v61, 8
-; SI-NEXT:    v_readlane_b32 s75, v61, 7
-; SI-NEXT:    v_cvt_f32_f16_e32 v60, s4
-; SI-NEXT:    s_and_b32 s4, s37, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s75, 8
+; SI-NEXT:    v_readlane_b32 s4, v61, 14
+; SI-NEXT:    v_readlane_b32 s5, v61, 13
+; SI-NEXT:    s_mov_b32 s15, s75
+; SI-NEXT:    s_mov_b32 s52, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s75, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s92, v61, 6
-; SI-NEXT:    v_readlane_b32 s77, v61, 5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
-; SI-NEXT:    s_and_b32 s4, s92, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s77, 8
-; SI-NEXT:    s_mov_b32 s28, s29
-; SI-NEXT:    s_mov_b32 s29, s76
+; SI-NEXT:    v_readlane_b32 s4, v61, 12
+; SI-NEXT:    v_readlane_b32 s5, v61, 11
+; SI-NEXT:    s_mov_b32 s88, s30
+; SI-NEXT:    s_mov_b32 s30, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s92, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s78, v61, 4
-; SI-NEXT:    v_readlane_b32 s76, v61, 3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s78, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s76, 8
+; SI-NEXT:    v_readlane_b32 s4, v61, 10
+; SI-NEXT:    v_readlane_b32 s5, v61, 9
+; SI-NEXT:    s_mov_b32 s13, s39
+; SI-NEXT:    s_mov_b32 s39, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_mov_b32 s77, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_mov_b32 s99, s55
-; SI-NEXT:    s_mov_b32 s20, s88
-; SI-NEXT:    s_mov_b32 s24, s98
-; SI-NEXT:    s_mov_b32 s59, s58
+; SI-NEXT:    s_mov_b32 s38, s95
+; SI-NEXT:    s_mov_b32 s20, s76
+; SI-NEXT:    s_mov_b32 s98, s59
 ; SI-NEXT:    s_mov_b32 s56, s47
-; SI-NEXT:    s_mov_b32 s46, s41
+; SI-NEXT:    s_mov_b32 s58, s57
 ; SI-NEXT:    s_mov_b32 s12, s11
-; SI-NEXT:    s_mov_b32 s11, s7
-; SI-NEXT:    s_mov_b32 s7, s97
-; SI-NEXT:    s_mov_b32 s97, s81
-; SI-NEXT:    s_mov_b32 s81, s85
-; SI-NEXT:    s_mov_b32 s6, s40
-; SI-NEXT:    s_mov_b32 s40, s72
-; SI-NEXT:    s_mov_b32 s45, s73
-; SI-NEXT:    s_mov_b32 s15, s89
+; SI-NEXT:    s_mov_b32 s41, s14
+; SI-NEXT:    s_mov_b32 s28, s29
+; SI-NEXT:    s_mov_b32 s7, s85
+; SI-NEXT:    s_mov_b32 s29, s25
+; SI-NEXT:    s_mov_b32 s85, s97
+; SI-NEXT:    s_mov_b32 s25, s69
+; SI-NEXT:    s_mov_b32 s97, s21
+; SI-NEXT:    s_mov_b32 s37, s66
+; SI-NEXT:    s_mov_b32 s69, s81
+; SI-NEXT:    s_mov_b32 s44, s45
+; SI-NEXT:    s_mov_b32 s66, s53
+; SI-NEXT:    s_mov_b32 s53, s34
+; SI-NEXT:    s_mov_b32 s34, s89
+; SI-NEXT:    s_mov_b32 s94, s68
+; SI-NEXT:    s_mov_b32 s89, s93
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT:    s_mov_b32 s55, s93
-; SI-NEXT:    s_mov_b32 s95, s91
-; SI-NEXT:    s_mov_b32 s31, s90
 ; SI-NEXT:    s_cbranch_execnz .LBB93_3
 ; SI-NEXT:  .LBB93_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s4, s78, 3
+; SI-NEXT:    s_add_i32 s4, s39, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s76, 8
+; SI-NEXT:    s_lshl_b32 s5, s77, 8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s5, s92, 3
+; SI-NEXT:    s_add_i32 s5, s30, 3
 ; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    s_lshl_b32 vcc_lo, s77, 8
+; SI-NEXT:    s_lshl_b32 vcc_lo, s92, 8
 ; SI-NEXT:    s_or_b32 s5, vcc_lo, s5
-; SI-NEXT:    s_add_i32 vcc_lo, s37, 3
+; SI-NEXT:    s_add_i32 vcc_lo, s52, 3
 ; SI-NEXT:    s_and_b32 vcc_lo, vcc_lo, 0xff
 ; SI-NEXT:    s_lshl_b32 vcc_hi, s75, 8
 ; SI-NEXT:    s_or_b32 vcc_lo, vcc_hi, vcc_lo
-; SI-NEXT:    s_add_i32 vcc_hi, s30, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s35, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s60, s39, 8
+; SI-NEXT:    s_lshl_b32 s60, s48, 8
 ; SI-NEXT:    s_or_b32 s60, s60, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s35, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s54, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s61, s48, 8
+; SI-NEXT:    s_lshl_b32 s61, s50, 8
 ; SI-NEXT:    s_or_b32 s61, s61, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s64, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s65, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s62, s52, 8
+; SI-NEXT:    s_lshl_b32 s62, s64, 8
 ; SI-NEXT:    s_or_b32 s62, s62, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s67, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s49, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s63, s50, 8
+; SI-NEXT:    s_lshl_b32 s63, s67, 8
 ; SI-NEXT:    s_or_b32 s10, s63, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s65, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s71, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s72, s54, 8
+; SI-NEXT:    s_lshl_b32 s72, s70, 8
 ; SI-NEXT:    s_or_b32 s72, s72, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s71, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s84, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s73, s70, 8
+; SI-NEXT:    s_lshl_b32 s73, s80, 8
 ; SI-NEXT:    s_or_b32 s73, s73, vcc_hi
 ; SI-NEXT:    s_add_i32 vcc_hi, s83, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s74, s49, 8
+; SI-NEXT:    s_lshl_b32 s74, s87, 8
 ; SI-NEXT:    s_or_b32 s74, s74, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s82, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s96, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s75, s80, 8
+; SI-NEXT:    s_lshl_b32 s75, s82, 8
 ; SI-NEXT:    s_or_b32 s75, s75, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s87, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s86, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s76, s84, 8
+; SI-NEXT:    s_lshl_b32 s76, s51, 8
 ; SI-NEXT:    s_or_b32 s76, s76, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s51, 3
-; SI-NEXT:    s_add_i32 s93, s53, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s6, 3
+; SI-NEXT:    s_add_i32 s93, s34, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s77, s86, 8
-; SI-NEXT:    s_add_i32 s89, s34, 3
+; SI-NEXT:    s_lshl_b32 s77, s55, 8
+; SI-NEXT:    s_add_i32 s89, s89, 3
 ; SI-NEXT:    s_and_b32 s93, s93, 0xff
 ; SI-NEXT:    s_lshl_b32 s78, s94, 8
-; SI-NEXT:    s_add_i32 s34, s66, 3
+; SI-NEXT:    s_add_i32 s34, s53, 3
 ; SI-NEXT:    s_or_b32 s77, s77, vcc_hi
 ; SI-NEXT:    s_and_b32 s89, s89, 0xff
-; SI-NEXT:    s_lshl_b32 vcc_hi, s96, 8
+; SI-NEXT:    s_lshl_b32 vcc_hi, s99, 8
 ; SI-NEXT:    s_or_b32 s22, s78, s93
 ; SI-NEXT:    s_and_b32 s93, s34, 0xff
-; SI-NEXT:    s_lshl_b32 s92, s16, 8
-; SI-NEXT:    s_add_i32 s53, s68, 3
+; SI-NEXT:    s_lshl_b32 s92, s88, 8
+; SI-NEXT:    s_add_i32 s53, s66, 3
 ; SI-NEXT:    s_or_b32 s89, vcc_hi, s89
 ; SI-NEXT:    s_or_b32 s92, s92, s93
 ; SI-NEXT:    s_and_b32 s93, s53, 0xff
-; SI-NEXT:    s_lshl_b32 vcc_hi, s15, 8
+; SI-NEXT:    s_lshl_b32 vcc_hi, s13, 8
 ; SI-NEXT:    s_add_i32 s66, s69, 3
 ; SI-NEXT:    s_or_b32 s93, vcc_hi, s93
 ; SI-NEXT:    s_and_b32 vcc_hi, s66, 0xff
-; SI-NEXT:    s_lshl_b32 s34, s45, 8
-; SI-NEXT:    s_add_i32 s68, s6, 3
+; SI-NEXT:    s_lshl_b32 s34, s44, 8
+; SI-NEXT:    s_add_i32 s68, s97, 3
 ; SI-NEXT:    s_or_b32 vcc_hi, s34, vcc_hi
 ; SI-NEXT:    s_and_b32 s34, s68, 0xff
-; SI-NEXT:    s_lshl_b32 s39, s40, 8
-; SI-NEXT:    s_add_i32 s69, s81, 3
+; SI-NEXT:    s_lshl_b32 s39, s37, 8
+; SI-NEXT:    s_add_i32 s69, s85, 3
 ; SI-NEXT:    s_or_b32 s34, s39, s34
 ; SI-NEXT:    s_and_b32 s39, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s52, s21, 8
+; SI-NEXT:    s_lshl_b32 s52, s25, 8
 ; SI-NEXT:    s_add_i32 s81, s7, 3
 ; SI-NEXT:    s_or_b32 s39, s52, s39
 ; SI-NEXT:    s_and_b32 s52, s81, 0xff
-; SI-NEXT:    s_lshl_b32 s53, s97, 8
-; SI-NEXT:    s_add_i32 s85, s12, 3
+; SI-NEXT:    s_lshl_b32 s53, s29, 8
+; SI-NEXT:    s_add_i32 s85, s41, 3
 ; SI-NEXT:    s_or_b32 s52, s53, s52
 ; SI-NEXT:    s_and_b32 s53, s85, 0xff
-; SI-NEXT:    s_lshl_b32 s64, s11, 8
-; SI-NEXT:    s_add_i32 s97, s56, 3
+; SI-NEXT:    s_lshl_b32 s64, s28, 8
+; SI-NEXT:    s_add_i32 s97, s58, 3
 ; SI-NEXT:    s_or_b32 s53, s64, s53
 ; SI-NEXT:    s_and_b32 s64, s97, 0xff
-; SI-NEXT:    s_lshl_b32 s66, s46, 8
-; SI-NEXT:    s_add_i32 s21, s29, 3
+; SI-NEXT:    s_lshl_b32 s66, s12, 8
+; SI-NEXT:    s_add_i32 s21, s15, 3
 ; SI-NEXT:    s_or_b32 s64, s66, s64
 ; SI-NEXT:    s_and_b32 s21, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s66, s59, 8
-; SI-NEXT:    s_add_i32 s25, s8, 3
+; SI-NEXT:    s_lshl_b32 s66, s56, 8
+; SI-NEXT:    s_add_i32 s25, s26, 3
 ; SI-NEXT:    s_or_b32 s66, s66, s21
 ; SI-NEXT:    s_and_b32 s21, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s28, 8
+; SI-NEXT:    s_lshl_b32 s6, s98, 8
 ; SI-NEXT:    s_add_i32 s29, s19, 3
 ; SI-NEXT:    s_or_b32 s67, s6, s21
 ; SI-NEXT:    s_and_b32 s6, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s18, s26, 8
+; SI-NEXT:    s_lshl_b32 s18, s20, 8
 ; SI-NEXT:    s_add_i32 s28, s17, 3
 ; SI-NEXT:    s_or_b32 s68, s18, s6
 ; SI-NEXT:    s_and_b32 s6, s28, 0xff
 ; SI-NEXT:    s_lshl_b32 s18, s23, 8
 ; SI-NEXT:    s_or_b32 s69, s18, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 17
+; SI-NEXT:    v_readlane_b32 s6, v62, 13
 ; SI-NEXT:    s_add_i32 s7, s6, 3
-; SI-NEXT:    v_readlane_b32 s16, v62, 15
+; SI-NEXT:    v_readlane_b32 s16, v62, 11
 ; SI-NEXT:    s_and_b32 s6, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v62, 16
+; SI-NEXT:    v_readlane_b32 s7, v62, 12
 ; SI-NEXT:    s_add_i32 s27, s16, 3
-; SI-NEXT:    v_readlane_b32 s16, v62, 13
+; SI-NEXT:    v_readlane_b32 s16, v62, 9
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_lshl_b32 s23, s16, 8
-; SI-NEXT:    v_readlane_b32 s16, v62, 14
-; SI-NEXT:    s_mov_b32 s91, s24
+; SI-NEXT:    v_readlane_b32 s16, v62, 10
 ; SI-NEXT:    s_or_b32 s70, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 20
+; SI-NEXT:    v_readlane_b32 s6, v62, 17
 ; SI-NEXT:    s_add_i32 s24, s16, 3
-; SI-NEXT:    v_readlane_b32 s16, v62, 11
+; SI-NEXT:    v_readlane_b32 s16, v62, 7
 ; SI-NEXT:    s_add_i32 s11, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 18
+; SI-NEXT:    v_readlane_b32 s7, v62, 14
 ; SI-NEXT:    s_lshl_b32 s19, s16, 8
-; SI-NEXT:    v_readlane_b32 s16, v62, 12
-; SI-NEXT:    s_mov_b32 s90, s20
+; SI-NEXT:    v_readlane_b32 s16, v62, 8
 ; SI-NEXT:    s_and_b32 s6, s11, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_add_i32 s20, s16, 3
-; SI-NEXT:    v_readlane_b32 s16, v62, 9
+; SI-NEXT:    v_readlane_b32 s16, v62, 5
 ; SI-NEXT:    s_or_b32 s71, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 22
+; SI-NEXT:    v_readlane_b32 s6, v62, 18
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s17, s16, 8
-; SI-NEXT:    v_readlane_b32 s16, v62, 10
+; SI-NEXT:    v_readlane_b32 s16, v62, 6
 ; SI-NEXT:    s_add_i32 s12, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 19
+; SI-NEXT:    v_readlane_b32 s7, v62, 15
 ; SI-NEXT:    s_or_b32 s17, s17, s20
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    v_readlane_b32 s20, v62, 8
+; SI-NEXT:    v_readlane_b32 s20, v62, 4
 ; SI-NEXT:    s_and_b32 s6, s12, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s16, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s20, s20, 8
 ; SI-NEXT:    s_or_b32 s81, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 23
+; SI-NEXT:    v_readlane_b32 s6, v62, 20
 ; SI-NEXT:    s_and_b32 s24, s24, 0xff
 ; SI-NEXT:    s_or_b32 s16, s20, s16
-; SI-NEXT:    v_readlane_b32 s20, v62, 7
+; SI-NEXT:    v_readlane_b32 s20, v62, 3
 ; SI-NEXT:    s_add_i32 s14, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 21
+; SI-NEXT:    v_readlane_b32 s7, v62, 16
 ; SI-NEXT:    s_or_b32 s19, s19, s24
 ; SI-NEXT:    s_add_i32 s98, s20, 3
-; SI-NEXT:    v_readlane_b32 s24, v62, 6
+; SI-NEXT:    v_readlane_b32 s24, v62, 2
 ; SI-NEXT:    s_and_b32 s6, s14, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s20, s98, 0xff
 ; SI-NEXT:    s_lshl_b32 s24, s24, 8
 ; SI-NEXT:    s_or_b32 s83, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 26
+; SI-NEXT:    v_readlane_b32 s6, v62, 23
 ; SI-NEXT:    s_and_b32 s27, s27, 0xff
 ; SI-NEXT:    s_or_b32 s20, s24, s20
-; SI-NEXT:    v_readlane_b32 s24, v62, 5
+; SI-NEXT:    v_readlane_b32 s24, v62, 1
 ; SI-NEXT:    s_add_i32 s41, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 24
+; SI-NEXT:    v_readlane_b32 s7, v62, 19
 ; SI-NEXT:    s_or_b32 s23, s23, s27
 ; SI-NEXT:    s_add_i32 s86, s24, 3
-; SI-NEXT:    v_readlane_b32 s27, v62, 4
+; SI-NEXT:    v_readlane_b32 s27, v62, 0
 ; SI-NEXT:    s_and_b32 s6, s41, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s24, s86, 0xff
@@ -182525,123 +182547,126 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_or_b32 s85, s7, s6
 ; SI-NEXT:    v_readlane_b32 s6, v62, 28
 ; SI-NEXT:    s_or_b32 s24, s27, s24
-; SI-NEXT:    v_readlane_b32 s27, v62, 3
+; SI-NEXT:    v_readlane_b32 s27, v61, 63
 ; SI-NEXT:    s_add_i32 s46, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 25
+; SI-NEXT:    v_readlane_b32 s7, v62, 21
 ; SI-NEXT:    s_add_i32 s12, s73, 0x300
 ; SI-NEXT:    s_add_i32 s82, s27, 3
-; SI-NEXT:    v_readlane_b32 s73, v62, 2
+; SI-NEXT:    v_readlane_b32 s73, v61, 62
 ; SI-NEXT:    s_and_b32 s6, s46, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s27, s82, 0xff
 ; SI-NEXT:    s_lshl_b32 s73, s73, 8
 ; SI-NEXT:    s_or_b32 s96, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 31
+; SI-NEXT:    v_readlane_b32 s6, v62, 29
 ; SI-NEXT:    s_or_b32 s27, s73, s27
-; SI-NEXT:    v_readlane_b32 s73, v62, 1
+; SI-NEXT:    v_readlane_b32 s73, v61, 61
 ; SI-NEXT:    s_add_i32 s47, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 27
+; SI-NEXT:    v_readlane_b32 s7, v62, 22
 ; SI-NEXT:    s_add_i32 s13, s74, 0x300
 ; SI-NEXT:    s_add_i32 s65, s73, 3
-; SI-NEXT:    v_readlane_b32 s74, v62, 0
+; SI-NEXT:    v_readlane_b32 s74, v61, 60
 ; SI-NEXT:    s_and_b32 s6, s47, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s73, s65, 0xff
 ; SI-NEXT:    s_lshl_b32 s74, s74, 8
 ; SI-NEXT:    s_or_b32 s97, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 34
+; SI-NEXT:    v_readlane_b32 s6, v62, 30
 ; SI-NEXT:    s_or_b32 s73, s74, s73
-; SI-NEXT:    v_readlane_b32 s74, v61, 63
+; SI-NEXT:    v_readlane_b32 s74, v61, 59
 ; SI-NEXT:    s_add_i32 s56, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 29
+; SI-NEXT:    v_readlane_b32 s7, v62, 24
 ; SI-NEXT:    s_add_i32 s14, s75, 0x300
 ; SI-NEXT:    s_add_i32 s54, s74, 3
-; SI-NEXT:    v_readlane_b32 s75, v61, 62
+; SI-NEXT:    v_readlane_b32 s75, v61, 58
 ; SI-NEXT:    s_and_b32 s6, s56, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s74, s54, 0xff
 ; SI-NEXT:    s_lshl_b32 s75, s75, 8
 ; SI-NEXT:    s_or_b32 s63, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 36
+; SI-NEXT:    v_readlane_b32 s6, v62, 32
 ; SI-NEXT:    s_or_b32 s74, s75, s74
-; SI-NEXT:    v_readlane_b32 s75, v61, 61
+; SI-NEXT:    v_readlane_b32 s75, v61, 57
 ; SI-NEXT:    s_add_i32 s58, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 30
+; SI-NEXT:    v_readlane_b32 s7, v62, 25
 ; SI-NEXT:    s_add_i32 s15, s76, 0x300
 ; SI-NEXT:    s_add_i32 s50, s75, 3
-; SI-NEXT:    v_readlane_b32 s76, v61, 60
+; SI-NEXT:    v_readlane_b32 s76, v61, 56
 ; SI-NEXT:    s_and_b32 s6, s58, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s75, s50, 0xff
 ; SI-NEXT:    s_lshl_b32 s76, s76, 8
 ; SI-NEXT:    s_or_b32 s79, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 37
+; SI-NEXT:    v_readlane_b32 s6, v62, 33
 ; SI-NEXT:    s_or_b32 s75, s76, s75
-; SI-NEXT:    v_readlane_b32 s76, v61, 59
+; SI-NEXT:    v_readlane_b32 s76, v61, 55
 ; SI-NEXT:    s_add_i32 s59, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 33
+; SI-NEXT:    v_readlane_b32 s7, v62, 27
 ; SI-NEXT:    s_add_i32 s18, s77, 0x300
 ; SI-NEXT:    s_add_i32 s48, s76, 3
-; SI-NEXT:    v_readlane_b32 s77, v61, 58
+; SI-NEXT:    v_readlane_b32 s77, v61, 54
 ; SI-NEXT:    s_and_b32 s6, s59, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s76, s48, 0xff
 ; SI-NEXT:    s_lshl_b32 s77, s77, 8
 ; SI-NEXT:    s_or_b32 s78, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 35
+; SI-NEXT:    v_readlane_b32 s6, v62, 31
 ; SI-NEXT:    s_or_b32 s76, s77, s76
-; SI-NEXT:    v_readlane_b32 s77, v61, 57
+; SI-NEXT:    v_readlane_b32 s77, v61, 53
 ; SI-NEXT:    s_add_i32 s57, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 32
+; SI-NEXT:    v_readlane_b32 s7, v62, 26
 ; SI-NEXT:    s_add_i32 s11, s72, 0x300
 ; SI-NEXT:    s_add_i32 s72, s79, 0x300
 ; SI-NEXT:    s_add_i32 s37, s77, 3
-; SI-NEXT:    v_readlane_b32 s79, v61, 56
+; SI-NEXT:    v_readlane_b32 s79, v61, 52
 ; SI-NEXT:    s_and_b32 s6, s57, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s77, s37, 0xff
 ; SI-NEXT:    s_lshl_b32 s79, s79, 8
 ; SI-NEXT:    s_or_b32 s88, s7, s6
 ; SI-NEXT:    s_or_b32 s77, s79, s77
-; SI-NEXT:    v_readlane_b32 s79, v61, 55
+; SI-NEXT:    v_readlane_b32 s79, v61, 51
 ; SI-NEXT:    s_add_i32 s21, s89, 0x300
 ; SI-NEXT:    s_add_i32 s89, s88, 0x300
 ; SI-NEXT:    s_add_i32 s35, s79, 3
-; SI-NEXT:    v_readlane_b32 s88, v61, 54
+; SI-NEXT:    v_readlane_b32 s88, v61, 50
 ; SI-NEXT:    s_and_b32 s79, s35, 0xff
 ; SI-NEXT:    s_lshl_b32 s88, s88, 8
-; SI-NEXT:    s_or_b32 s79, s88, s79
-; SI-NEXT:    v_readlane_b32 s88, v61, 53
+; SI-NEXT:    v_readlane_b32 s90, v61, 48
 ; SI-NEXT:    s_add_i32 s25, s92, 0x300
+; SI-NEXT:    s_or_b32 s79, s88, s79
+; SI-NEXT:    v_readlane_b32 s88, v61, 49
+; SI-NEXT:    s_lshl_b32 s92, s90, 8
+; SI-NEXT:    v_readlane_b32 s90, v61, 47
 ; SI-NEXT:    s_add_i32 s30, s88, 3
-; SI-NEXT:    v_readlane_b32 s92, v61, 52
+; SI-NEXT:    s_add_i32 s94, s90, 3
+; SI-NEXT:    v_readlane_b32 s90, v61, 46
 ; SI-NEXT:    s_and_b32 s88, s30, 0xff
-; SI-NEXT:    s_lshl_b32 s92, s92, 8
+; SI-NEXT:    s_lshl_b32 s91, s90, 8
+; SI-NEXT:    v_readlane_b32 s90, v61, 45
 ; SI-NEXT:    s_or_b32 s88, s92, s88
-; SI-NEXT:    v_readlane_b32 s92, v61, 51
-; SI-NEXT:    s_add_i32 s94, s92, 3
 ; SI-NEXT:    s_and_b32 s92, s94, 0xff
-; SI-NEXT:    s_lshl_b32 s91, s91, 8
 ; SI-NEXT:    s_add_i32 s90, s90, 3
 ; SI-NEXT:    s_or_b32 s91, s91, s92
 ; SI-NEXT:    s_and_b32 s90, s90, 0xff
-; SI-NEXT:    s_lshl_b32 s92, s31, 8
+; SI-NEXT:    s_lshl_b32 s92, s38, 8
 ; SI-NEXT:    s_or_b32 s90, s92, s90
-; SI-NEXT:    v_readlane_b32 s92, v61, 50
-; SI-NEXT:    s_add_i32 s92, s92, 3
+; SI-NEXT:    v_readlane_b32 s92, v61, 44
 ; SI-NEXT:    s_add_i32 s26, s93, 0x300
+; SI-NEXT:    s_add_i32 s92, s92, 3
+; SI-NEXT:    v_readlane_b32 s93, v61, 43
 ; SI-NEXT:    s_and_b32 s92, s92, 0xff
-; SI-NEXT:    s_lshl_b32 s93, s95, 8
+; SI-NEXT:    s_lshl_b32 s93, s93, 8
 ; SI-NEXT:    s_or_b32 s92, s93, s92
-; SI-NEXT:    v_readlane_b32 s93, v61, 49
+; SI-NEXT:    v_readlane_b32 s93, v61, 42
 ; SI-NEXT:    s_add_i32 s93, s93, 3
 ; SI-NEXT:    s_and_b32 s93, s93, 0xff
-; SI-NEXT:    s_lshl_b32 s94, s55, 8
+; SI-NEXT:    s_lshl_b32 s94, s36, 8
 ; SI-NEXT:    s_or_b32 s93, s94, s93
-; SI-NEXT:    v_readlane_b32 s94, v61, 48
+; SI-NEXT:    v_readlane_b32 s94, v61, 41
 ; SI-NEXT:    s_add_i32 s94, s94, 3
 ; SI-NEXT:    s_and_b32 s94, s94, 0xff
-; SI-NEXT:    s_lshl_b32 s95, s99, 8
+; SI-NEXT:    s_lshl_b32 s95, s31, 8
 ; SI-NEXT:    s_or_b32 s94, s95, s94
 ; SI-NEXT:    v_readlane_b32 s95, v61, 1
 ; SI-NEXT:    s_add_i32 s95, s95, 3
@@ -182649,30 +182674,30 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_add_i32 s6, vcc_lo, 0x300
 ; SI-NEXT:    s_and_b32 s95, s95, 0xff
 ; SI-NEXT:    s_lshl_b32 vcc_lo, s30, 8
-; SI-NEXT:    v_readlane_b32 s30, v61, 47
+; SI-NEXT:    v_readlane_b32 s30, v61, 3
 ; SI-NEXT:    s_or_b32 s95, vcc_lo, s95
 ; SI-NEXT:    s_add_i32 vcc_lo, s30, 3
 ; SI-NEXT:    v_readlane_b32 s30, v61, 2
 ; SI-NEXT:    s_add_i32 s28, vcc_hi, 0x300
 ; SI-NEXT:    s_and_b32 vcc_lo, vcc_lo, 0xff
 ; SI-NEXT:    s_lshl_b32 vcc_hi, s30, 8
-; SI-NEXT:    v_readlane_b32 s30, v61, 46
+; SI-NEXT:    v_readlane_b32 s30, v61, 5
 ; SI-NEXT:    s_or_b32 vcc_lo, vcc_hi, vcc_lo
 ; SI-NEXT:    s_add_i32 vcc_hi, s30, 3
-; SI-NEXT:    v_readlane_b32 s30, v61, 45
+; SI-NEXT:    v_readlane_b32 s30, v61, 4
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
 ; SI-NEXT:    s_lshl_b32 s30, s30, 8
 ; SI-NEXT:    s_or_b32 vcc_hi, s30, vcc_hi
-; SI-NEXT:    v_readlane_b32 s30, v61, 44
+; SI-NEXT:    v_readlane_b32 s30, v61, 7
 ; SI-NEXT:    s_add_i32 s30, s30, 3
-; SI-NEXT:    v_readlane_b32 s31, v61, 43
+; SI-NEXT:    v_readlane_b32 s31, v61, 6
 ; SI-NEXT:    s_and_b32 s30, s30, 0xff
 ; SI-NEXT:    s_lshl_b32 s31, s31, 8
 ; SI-NEXT:    s_or_b32 s30, s31, s30
-; SI-NEXT:    v_readlane_b32 s31, v61, 42
+; SI-NEXT:    v_readlane_b32 s31, v61, 40
 ; SI-NEXT:    s_add_i32 s29, s34, 0x300
 ; SI-NEXT:    s_add_i32 s31, s31, 3
-; SI-NEXT:    v_readlane_b32 s34, v61, 41
+; SI-NEXT:    v_readlane_b32 s34, v61, 8
 ; SI-NEXT:    s_and_b32 s31, s31, 0xff
 ; SI-NEXT:    s_lshl_b32 s34, s34, 8
 ; SI-NEXT:    s_or_b32 s31, s34, s31
@@ -182680,23 +182705,23 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s31
 ; SI-NEXT:    s_addk_i32 s30, 0x300
 ; SI-NEXT:    s_addk_i32 vcc_hi, 0x300
-; SI-NEXT:    v_readlane_b32 s34, v61, 40
+; SI-NEXT:    v_readlane_b32 s34, v61, 39
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s30
 ; SI-NEXT:    s_add_i32 s34, s34, 3
-; SI-NEXT:    v_readlane_b32 s35, v61, 39
+; SI-NEXT:    v_readlane_b32 s35, v61, 38
 ; SI-NEXT:    s_and_b32 s34, s34, 0xff
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, vcc_hi
 ; SI-NEXT:    s_lshl_b32 s35, s35, 8
 ; SI-NEXT:    s_addk_i32 vcc_lo, 0x300
 ; SI-NEXT:    s_or_b32 s34, s35, s34
-; SI-NEXT:    v_readlane_b32 s35, v61, 38
+; SI-NEXT:    v_readlane_b32 s35, v61, 37
 ; SI-NEXT:    s_add_i32 s35, s35, 3
-; SI-NEXT:    v_readlane_b32 s36, v61, 37
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s36, v61, 36
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, vcc_lo
 ; SI-NEXT:    s_and_b32 s35, s35, 0xff
@@ -182752,10 +182777,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s34
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s95
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, s94
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, s94
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, s93
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, s92
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, s90
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, s90
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, s91
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, s88
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, s79
@@ -182796,32 +182821,35 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, s25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, s22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, s21
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, s18
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, s15
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, s14
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, s13
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, s12
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, s11
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, s10
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, s9
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s8
-; SI-NEXT:    v_cvt_f32_f16_e32 v60, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, s18
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, s15
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, s14
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, s13
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, s12
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, s11
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, s10
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, s9
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, s8
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
 ; SI-NEXT:  .LBB93_3: ; %end
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
+; SI-NEXT:    v_readlane_b32 s99, v63, 35
 ; SI-NEXT:    v_readlane_b32 s98, v63, 34
 ; SI-NEXT:    v_readlane_b32 s97, v63, 33
 ; SI-NEXT:    v_readlane_b32 s96, v63, 32
@@ -182867,7 +182895,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -182877,7 +182905,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 12, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -182892,7 +182920,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v10
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
@@ -183031,48 +183059,45 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v44
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v42
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v45
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v47
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v59
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v58
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v57
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v58
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x74, v0
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v60
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v59
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x74, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v60
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x78, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -183103,74 +183128,73 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_mov_b32 s17, s19
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    s_mov_b32 s19, s50
+; SI-NEXT:    s_mov_b32 s19, s54
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    s_mov_b32 s23, s48
-; SI-NEXT:    s_mov_b32 s26, s37
-; SI-NEXT:    s_mov_b32 s28, s29
-; SI-NEXT:    s_mov_b32 s29, s76
-; SI-NEXT:    s_mov_b32 s59, s58
+; SI-NEXT:    s_mov_b32 s26, s50
+; SI-NEXT:    s_mov_b32 s23, s35
+; SI-NEXT:    s_mov_b32 s15, s75
+; SI-NEXT:    s_mov_b32 s20, s76
+; SI-NEXT:    s_mov_b32 s98, s59
+; SI-NEXT:    s_mov_b32 s58, s57
 ; SI-NEXT:    s_mov_b32 s56, s47
-; SI-NEXT:    s_mov_b32 s46, s41
+; SI-NEXT:    s_mov_b32 s41, s14
 ; SI-NEXT:    s_mov_b32 s12, s11
-; SI-NEXT:    s_mov_b32 s11, s7
-; SI-NEXT:    s_mov_b32 s7, s97
-; SI-NEXT:    s_mov_b32 s97, s81
-; SI-NEXT:    s_mov_b32 s81, s85
-; SI-NEXT:    s_mov_b32 s6, s40
-; SI-NEXT:    s_mov_b32 s40, s72
-; SI-NEXT:    s_mov_b32 s45, s73
-; SI-NEXT:    s_mov_b32 s15, s89
-; SI-NEXT:    s_mov_b32 s24, s98
-; SI-NEXT:    s_mov_b32 s20, s88
-; SI-NEXT:    s_mov_b32 s99, s55
+; SI-NEXT:    s_mov_b32 s7, s85
+; SI-NEXT:    s_mov_b32 s28, s29
+; SI-NEXT:    s_mov_b32 s29, s25
+; SI-NEXT:    s_mov_b32 s85, s97
+; SI-NEXT:    s_mov_b32 s97, s21
+; SI-NEXT:    s_mov_b32 s25, s69
+; SI-NEXT:    s_mov_b32 s69, s81
+; SI-NEXT:    s_mov_b32 s37, s66
+; SI-NEXT:    s_mov_b32 s66, s53
+; SI-NEXT:    s_mov_b32 s53, s34
+; SI-NEXT:    s_mov_b32 s34, s89
+; SI-NEXT:    s_mov_b32 s89, s93
+; SI-NEXT:    s_mov_b32 s44, s45
+; SI-NEXT:    s_mov_b32 s13, s39
+; SI-NEXT:    s_mov_b32 s88, s30
+; SI-NEXT:    s_mov_b32 s38, s95
+; SI-NEXT:    s_mov_b32 s94, s68
+; SI-NEXT:    s_mov_b32 s36, s96
+; SI-NEXT:    s_mov_b32 s31, s6
+; SI-NEXT:    v_readlane_b32 s6, v61, 34
+; SI-NEXT:    v_readlane_b32 s99, v61, 35
+; SI-NEXT:    v_readlane_b32 s55, v61, 33
+; SI-NEXT:    v_readlane_b32 s86, v61, 32
+; SI-NEXT:    v_readlane_b32 s96, v61, 30
+; SI-NEXT:    v_readlane_b32 s51, v61, 31
+; SI-NEXT:    v_readlane_b32 s83, v61, 28
+; SI-NEXT:    v_readlane_b32 s82, v61, 29
+; SI-NEXT:    v_readlane_b32 s84, v61, 26
+; SI-NEXT:    v_readlane_b32 s87, v61, 27
+; SI-NEXT:    v_readlane_b32 s80, v61, 25
+; SI-NEXT:    v_readlane_b32 s71, v61, 24
+; SI-NEXT:    v_readlane_b32 s49, v61, 22
+; SI-NEXT:    v_readlane_b32 s70, v61, 23
+; SI-NEXT:    v_readlane_b32 s65, v61, 20
+; SI-NEXT:    v_readlane_b32 s67, v61, 21
+; SI-NEXT:    v_readlane_b32 s54, v61, 18
+; SI-NEXT:    v_readlane_b32 s64, v61, 19
+; SI-NEXT:    v_readlane_b32 s50, v61, 17
+; SI-NEXT:    v_readlane_b32 s35, v61, 16
+; SI-NEXT:    v_readlane_b32 s52, v61, 14
+; SI-NEXT:    v_readlane_b32 s48, v61, 15
+; SI-NEXT:    v_readlane_b32 s30, v61, 12
+; SI-NEXT:    v_readlane_b32 s39, v61, 10
+; SI-NEXT:    v_readlane_b32 s92, v61, 11
+; SI-NEXT:    v_readlane_b32 s77, v61, 9
+; SI-NEXT:    v_readlane_b32 s75, v61, 13
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_readlane_b32 s75, v61, 7
-; SI-NEXT:    v_readlane_b32 s76, v61, 3
-; SI-NEXT:    v_readlane_b32 s77, v61, 5
-; SI-NEXT:    v_readlane_b32 s78, v61, 4
-; SI-NEXT:    v_readlane_b32 s92, v61, 6
-; SI-NEXT:    v_readlane_b32 s39, v61, 9
-; SI-NEXT:    v_readlane_b32 s37, v61, 8
-; SI-NEXT:    v_readlane_b32 s30, v61, 10
-; SI-NEXT:    v_readlane_b32 s48, v61, 11
-; SI-NEXT:    v_readlane_b32 s52, v61, 13
-; SI-NEXT:    v_readlane_b32 s35, v61, 12
-; SI-NEXT:    v_readlane_b32 s50, v61, 15
-; SI-NEXT:    v_readlane_b32 s64, v61, 14
-; SI-NEXT:    v_readlane_b32 s54, v61, 17
-; SI-NEXT:    v_readlane_b32 s67, v61, 16
-; SI-NEXT:    v_readlane_b32 s65, v61, 18
-; SI-NEXT:    v_readlane_b32 s70, v61, 19
-; SI-NEXT:    v_readlane_b32 s49, v61, 21
-; SI-NEXT:    v_readlane_b32 s71, v61, 20
-; SI-NEXT:    v_readlane_b32 s80, v61, 23
-; SI-NEXT:    v_readlane_b32 s83, v61, 22
-; SI-NEXT:    v_readlane_b32 s84, v61, 25
-; SI-NEXT:    v_readlane_b32 s82, v61, 24
-; SI-NEXT:    v_readlane_b32 s87, v61, 26
-; SI-NEXT:    v_readlane_b32 s86, v61, 27
-; SI-NEXT:    v_readlane_b32 s96, v61, 29
-; SI-NEXT:    v_readlane_b32 s51, v61, 28
-; SI-NEXT:    s_mov_b32 s55, s93
-; SI-NEXT:    s_mov_b32 s95, s91
-; SI-NEXT:    v_readlane_b32 s94, v61, 31
-; SI-NEXT:    s_mov_b32 s31, s90
-; SI-NEXT:    v_readlane_b32 s34, v61, 30
-; SI-NEXT:    v_readlane_b32 s53, v61, 32
-; SI-NEXT:    v_readlane_b32 s66, v61, 33
-; SI-NEXT:    v_readlane_b32 s68, v61, 34
-; SI-NEXT:    v_readlane_b32 s69, v61, 35
-; SI-NEXT:    v_readlane_b32 s8, v61, 36
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr15
@@ -183211,16 +183235,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
index 4a0bb6ceccd3f..e7eefafe31203 100644
--- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
+++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
@@ -41,24 +41,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_S32_SAVE $sgpr15, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_S32_SAVE $sgpr14, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr14_sgpr15 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
   ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
-  ; CHECK-NEXT:   SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
-  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
   ; CHECK-NEXT:   SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
   ; CHECK-NEXT:   renamable $sgpr56 = S_MOV_B32 0
   ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
-  ; CHECK-NEXT:   SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.5, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.5, align 4, addrspace 5)
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec
   ; CHECK-NEXT:   renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
   ; CHECK-NEXT:   renamable $sgpr57 = S_MOV_B32 1083786240
-  ; CHECK-NEXT:   SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   S_BRANCH %bb.1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.17(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]]
@@ -67,9 +69,9 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.5(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr40 = COPY renamable $sgpr72
   ; CHECK-NEXT:   renamable $sgpr41 = COPY renamable $sgpr72
   ; CHECK-NEXT:   renamable $sgpr42 = COPY renamable $sgpr72
@@ -83,58 +85,58 @@ body:             |
   ; CHECK-NEXT:   renamable $sgpr50 = COPY renamable $sgpr72
   ; CHECK-NEXT:   renamable $sgpr51 = COPY killed renamable $sgpr72
   ; CHECK-NEXT:   renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
-  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr56 = COPY killed renamable $sgpr72
-  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
   ; CHECK-NEXT:   renamable $sgpr52 = COPY renamable $sgpr56
   ; CHECK-NEXT:   renamable $sgpr53 = COPY killed renamable $sgpr76
   ; CHECK-NEXT:   renamable $sgpr56_sgpr57 = COPY renamable $sgpr52_sgpr53
   ; CHECK-NEXT:   renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
-  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
   ; CHECK-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57
   ; CHECK-NEXT:   renamable $sgpr54 = COPY killed renamable $sgpr76
   ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
   ; CHECK-NEXT:   renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54
-  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50
   ; CHECK-NEXT:   renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
   ; CHECK-NEXT:   renamable $sgpr55 = COPY killed renamable $sgpr68
-  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr56 = COPY killed renamable $sgpr72
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr57 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr58 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr59 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr60 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr61 = COPY killed renamable $sgpr80
-  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr62 = COPY killed renamable $sgpr80
-  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr63 = COPY killed renamable $sgpr80
-  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr64 = COPY killed renamable $sgpr80
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr65 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr66 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr67 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr68 = COPY killed renamable $sgpr84
   ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
   ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
   ; CHECK-NEXT:   renamable $sgpr64 = COPY renamable $sgpr68
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr65 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr66 = COPY killed renamable $sgpr84
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr67 = COPY killed renamable $sgpr84
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.11, implicit $exec
@@ -142,32 +144,30 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16
+  ; CHECK-NEXT:   liveins: $sgpr16
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr60 = COPY killed renamable $sgpr14
-  ; CHECK-NEXT:   renamable $sgpr62 = COPY killed renamable $sgpr15
-  ; CHECK-NEXT:   SI_SPILL_S32_SAVE killed renamable $sgpr16, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr60 = COPY killed renamable $sgpr16
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, CustomRegMask($sgpr60,$sgpr62)
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.17(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr60, $sgpr62
+  ; CHECK-NEXT:   liveins: $sgpr60
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   $sgpr12 = COPY killed renamable $sgpr60
-  ; CHECK-NEXT:   $sgpr13 = COPY killed renamable $sgpr62
-  ; CHECK-NEXT:   $sgpr14 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5)
+  ; CHECK-NEXT:   $sgpr12 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
+  ; CHECK-NEXT:   $sgpr13 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5)
+  ; CHECK-NEXT:   $sgpr14 = COPY killed renamable $sgpr60
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_noregs, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   S_BRANCH %bb.17
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.12(0x40000000), %bb.6(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc
   ; CHECK-NEXT:   renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
   ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13
@@ -175,33 +175,33 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec
+  ; CHECK-NEXT:   dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr4, 11, implicit-def $m0, implicit $m0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.9(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.10, implicit $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.9:
   ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.17(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec
   ; CHECK-NEXT:   GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1)
-  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr18_sgpr19, implicit $exec
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr14_sgpr15, implicit $exec
   ; CHECK-NEXT:   dead renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
   ; CHECK-NEXT:   renamable $sgpr82 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -211,21 +211,17 @@ body:             |
   ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY renamable $sgpr70_sgpr71
   ; CHECK-NEXT:   renamable $sgpr80_sgpr81 = COPY killed renamable $sgpr10_sgpr11
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY renamable $sgpr80_sgpr81
-  ; CHECK-NEXT:   $sgpr12 = COPY renamable $sgpr14
-  ; CHECK-NEXT:   $sgpr13 = COPY renamable $sgpr15
+  ; CHECK-NEXT:   $sgpr12 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5)
+  ; CHECK-NEXT:   $sgpr13 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5)
   ; CHECK-NEXT:   renamable $sgpr84 = COPY killed renamable $sgpr8
   ; CHECK-NEXT:   renamable $sgpr33 = COPY killed renamable $sgpr16
-  ; CHECK-NEXT:   renamable $sgpr83 = COPY killed renamable $sgpr15
-  ; CHECK-NEXT:   renamable $sgpr85 = COPY killed renamable $sgpr14
-  ; CHECK-NEXT:   renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19
+  ; CHECK-NEXT:   renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr14_sgpr15
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9
-  ; CHECK-NEXT:   renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49
-  ; CHECK-NEXT:   renamable $sgpr14 = COPY killed renamable $sgpr85
-  ; CHECK-NEXT:   renamable $sgpr15 = COPY killed renamable $sgpr83
+  ; CHECK-NEXT:   renamable $sgpr14_sgpr15 = COPY killed renamable $sgpr48_sgpr49
   ; CHECK-NEXT:   renamable $sgpr16 = COPY killed renamable $sgpr33
   ; CHECK-NEXT:   renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr68_sgpr69
   ; CHECK-NEXT:   renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr70_sgpr71
@@ -238,49 +234,49 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.10:
   ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.12(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.8, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.12
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.11:
   ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.17(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.17
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.12:
   ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.13(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.11, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.13:
   ; CHECK-NEXT:   successors: %bb.15(0x40000000), %bb.14(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.5, align 4, addrspace 5)
   ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.15, implicit $vcc
   ; CHECK-NEXT:   S_BRANCH %bb.14
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.14:
   ; CHECK-NEXT:   successors: %bb.15(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.15:
   ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.16(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT:   liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5)
   ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.11, implicit $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.16:
   ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.17(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16
+  ; CHECK-NEXT:   liveins: $sgpr16
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
   ; CHECK-NEXT: {{  $}}



More information about the llvm-branch-commits mailing list